python爬虫学习(requests模块)

it2025-04-08 37

requests模块介绍:

- Python 中一款基于网络请求的第三方模块（并非标准库自带，需通过 pip 安装），功能非常强大，简单便捷，效率极高。作用：模拟浏览器发请求。

如何使用：(requests模块的编码流程) - 指定url - 发起请求 - 获取响应数据 - 持久化存储

环境安装: - pip install requests

练习： 1.爬取搜狗首页的页面数据

# Goal: scrape the Sogou home page and save it locally.
import requests

# Step 1: the URL to fetch.
home_url = 'https://www.sogou.com/'

# Steps 2-3: send the GET request and read the response body as text.
home_html = requests.get(url=home_url).text
print(home_html)

# Step 4: persist the page to disk.
with open('./sogou.html', 'w', encoding='utf-8') as out_file:
    out_file.write(home_html)

练习巩固： 1.爬取搜狗指定词条对应的搜索结果页面(简易网页采集器)(https://www.sogou.com)

# UA detection: fetch the Sogou result page for a user-supplied keyword.
import requests

# Search endpoint; the keyword is sent as the 'query' URL parameter.
search_url = 'https://www.sogou.com/web'

# Browser-like User-Agent so the request passes UA detection.
request_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0',
}

keyword = input("请输入要查询的内容:")
query_params = {'query': keyword}

# Send the request and take the response body as text.
response = requests.get(
    url=search_url,
    params=query_params,
    headers=request_headers,
)
result_html = response.text

# Persist the result page under a name derived from the keyword.
out_name = keyword + '.html'
with open(out_name, 'w', encoding='utf-8') as out_file:
    out_file.write(result_html)
print(out_name, '保存成功')

2.爬取百度翻译(https://fanyi.baidu.com)

# POST request carrying form parameters; the response body is JSON.
import requests

suggest_url = 'https://fanyi.baidu.com/sug'

request_headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0',
}

word = input("请输入要翻译的内容:")

# The word to translate is posted as the 'kw' form field.
response = requests.post(
    url=suggest_url,
    data={'kw': word},
    headers=request_headers,
)

# Decode the JSON body into Python objects and show it.
print(response.json())

3.爬取豆瓣电影分类排行榜中的电影详情数据(https://movie.douban.com)

# json.dump: fetch the Douban chart data as JSON and store it in a file.
import requests
import json

# Endpoint that returns the chart entries.
url = 'https://movie.douban.com/j/chart/top_list'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0'
}

param = {
    'type': '11',
    'interval_id': '100:90',
    'action': '',
    # Index of the first movie to fetch within the current category.
    'start': '0',
    # Number of movies to fetch in one request.
    'limit': '20',
}

page_json = requests.get(url=url, params=param, headers=headers).json()
# print(page_json)

# Use a context manager so the file is flushed and closed even if
# json.dump raises; the original left the handle open.
with open('./douban.json', 'w', encoding='utf-8') as fp:
    json.dump(page_json, fp=fp, ensure_ascii=False)

4.爬取肯德基餐厅地址查询(http://www.kfc.com.cn/kfccda/)

# Query the KFC store-list endpoint for a keyword and page, then save the
# raw response text.
import requests
import json

url = 'http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx?op=keyword'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0'
}

kw = input('请输入要查询的地址:')
page = input('请输入查询的页数:')

# Form fields posted to the endpoint; keyword and page come from the user.
data = {
    'cname': '',
    'pid': '',
    'keyword': kw,
    'pageIndex': page,
    'pageSize': '10',
}

# Send the POST once. The original issued the same request twice and never
# used the first (JSON-decoded) result.
response = requests.post(url=url, data=data, headers=headers)
page_text = response.text

# Persist the response body under a name built from keyword and page.
fileName = kw + page + '.html'
with open(fileName, 'w', encoding='utf-8') as fp:
    fp.write(page_text)

print('查询完毕')

5.爬取中华人民共和国化妆品生产许可证相关数据(http://scxk.nmpa.gov.cn:81/xk/)

# Dynamically loaded data.
# - The company entries on the home page are fetched through an ajax request.
# - Detail-page URLs share the same domain; only the carried parameter differs.
# - That parameter can be taken from the JSON returned by the home-page ajax
#   request, so domain + parameter gives the full detail URL for a company.
# - The company details on the detail page are also loaded dynamically.
import requests
import json

url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsList'

headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0'
}

# Collected company IDs.
id_list = []
# Detail data for every company.
all_data_list = []

# Paging: collect the company IDs listed on each page.
# NOTE(review): range(1) requests only page '0' - confirm whether the
# endpoint's page index is expected to start at 1.
for page in range(1):
    page = str(page)
    data = {
        'on': 'true',
        'page': page,
        'pageSize': '15',
        'productName': '',
        'conditionType': '1',
        'applyname': '',
        'applysn': '',
    }
    ids_json = requests.post(url=url, headers=headers, data=data).json()
    # print(ids_json)
    id_list.extend(dic['ID'] for dic in ids_json['list'])
# print(id_list)

# Fetch the detail record for each collected ID.
post_url = 'http://scxk.nmpa.gov.cn:81/xk/itownet/portalAction.do?method=getXkzsById'
# Loop variable renamed from `id` so the builtin is not shadowed.
for company_id in id_list:
    data = {
        'id': company_id
    }
    data_json = requests.post(url=post_url, headers=headers, data=data).json()
    # print(data_json)
    all_data_list.append(data_json)
print(all_data_list)

# Persist the results. The context manager closes the file even if
# json.dump raises; the original left the handle open.
with open('./allData.json', 'w', encoding='utf-8') as fp:
    json.dump(all_data_list, fp=fp, ensure_ascii=False)
print('第', page, '页爬取完毕')

最新回复(0)