As before, the cookie (dic_cook) and header (dic_h) values in the code have been redacted; please replace them with the headers and cookies from your own browser. To find them: open DevTools, go to Network → Doc, refresh the page, click the request under Name, and the Headers and Cookies appear on the right. If you get stuck, I'm sure a quick Baidu search will turn up a walkthrough. A minimal sketch of what the pasted values look like is shown below.
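For illustration only, here is roughly what the copied values end up as once pasted into Python. This is not part of the original script: raw_cookie is a hypothetical helper name, and both values are placeholders to be replaced with your own.

# Sketch: turning the values copied from DevTools into the dicts the scraper expects.
# Both values are placeholders -- paste the ones from your own browser.
dic_h = {'User-Agent': 'Mozilla/5.0 (placeholder UA string)'}
raw_cookie = 'key1=value1; key2=value2'   # the whole Cookie header, copied as one string

dic_cook = {}
for pair in raw_cookie.split('; '):
    name, _, value = pair.partition('=')  # partition tolerates '=' inside a value
    dic_cook[name] = value

print(dic_cook)  # {'key1': 'value1', 'key2': 'value2'}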
The first part is the scraper itself, which crawls the data for all platforms on the site. Judging by the results, over 92% of the data was collected successfully; the URLs that failed have not been investigated further for now. A second part follows.
import requests
from bs4 import BeautifulSoup
import pandas as pd
import random, time

'''
Goal: export the platform data from wdzj.com (网贷之家).
'''

def get_url(ui, n):
    '''
    :param ui: page URL template with a {} placeholder for the page number
    :param n: pages 1 .. n-1 are generated
    :return: list of page URLs
    '''
    # ui = 'https://www.wdzj.com/dangan/search?filter=&sort=1&currentPage={}'
    urllst = []
    for i in range(1, n):
        urllst.append(ui.format(i))
    return urllst

def get_date(url, dic_h, dic_cook):
    '''
    :param url: page URL to scrape
    :param dic_h: request headers
    :param dic_cook: request cookies
    :return: list of dicts, one per platform on the page
    '''
    ri = requests.get(url, headers=dic_h, cookies=dic_cook)
    soup = BeautifulSoup(ri.text, 'lxml')
    lis = soup.find('ul', class_="terraceList").find_all('li', class_="item")
    datalst = []
    for li in lis:
        dic = {}
        dic['name'] = li.find('h2').find('a').text.replace(' ', '').replace('\n', '')
        # grab the em text inside each tag div of the itemTitle block
        divs = li.find('div', class_="itemTitle").find_all('div', class_="itemTitleTag tag")
        lst = []
        for div in divs:
            lst.append(div.find('em').text)
        dic['types'] = lst
        # grab the text of every itemConBox div inside the itemConLeft block
        partones = li.find('a', class_="itemConLeft").find_all('div', class_="itemConBox")
        part1 = []
        for i in partones:
            part1.append(i.text.replace(' ', '').replace('\n', '').replace('\t', '').replace('\r', ''))
        dic['part1'] = part1
        datalst.append(dic)
    # print(lis[0].prettify())
    # print(datalst)
    # print(len(lis))
    return datalst

if __name__ == '__main__':
    # s1: build the list of page URLs to scrape
    ui = 'https://www.wdzj.com/dangan/search?filter=&sort=1&currentPage={}'
    urls = get_url(ui, 265)
    print(urls[0])
    # s2: fetch and parse the data from each URL
    # redacted in the post -- replace the User-Agent (the prefix here is a stand-in) and the
    # cookie string below with the values copied from your own browser
    dic_h = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                           '(KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36'}
    cooks = """__jsluid_s=8c0d38a7eb6fc3afde6b071b93a502cb; PHPSESSID=s2md6ancu6ispqaftookh5q5c4; WDZJptlbs=1; gr_user_id=bc43ce37-69cf-4e66-8ac9-62bca6b6976a; _rebate_csrfbf34301ab4dff82efbd8f; _ga=GA1.2.1877523745.1603062582; _gid=GA1.2.1704066258.1603062582; Z6wq_e6e3_saltkey=m92C2IV2; Z6wq_e6e3_auth=f4ddWOBNrNGoa5C7Rj9GAgCp0TqQzler7Ht4d0OL3NHs8k29y6eS%2BDZZ6JtoU8l%2Bll1ACSyfKylWiNnGbxBkVoMuelq7; auth_token=4f3eK%2B%2FaJugDcpveAdF8aUJeD%2FrQ4eQHV6OOCYImNda2PlkFb95nyQ12iApUbadWukRbU2hFNo%2F%2BB0m19bDMUynNSGrO; uid=2025289; login_channel=1; pc_login=1; gr_cs1_83f8fe36-234c-4b43-8b0c-089a3dbc01b0=user_id%3A2025289; Hm_lvt_9e837711961994d9830dcd3f4b45f0b3=1603062522,1603062643; WDZJ_FRONT_SESSION_ID=c0e6384b96d7438db6fd147848ccad0139024320909301639; Z6wq_e6e3_ulastactivity=27c8QzpyZsaj%2Fi%2BJ5r%2BENX2xNz%2BPbiY%2F20WsBjaIr_cs1_27fcdb36-02b3-403e-bb55-e7c3e2eed5f6=user_id%3A2025289; _gat=1; Hm_lpvt_9e837711961994d9830dcd3f4b45f0b3=1603064458; gr_session_id_1931ea22324b4036a653ff1d3a0b4693_27fcdb36-02b3-403e-bb55-e7c3e2eed5f6=true"""
    dic_cook = {}
    for co in cooks.split('; '):
        # partition tolerates the redacted entries above that lost their '=' sign
        k, _, v = co.partition('=')
        dic_cook[k] = v
    # get_date(urls[0], dic_h, dic_cook)
    datalst = []
    errurl = []
    for url in urls:
        time.sleep(random.randint(1, 3))
        try:
            datalst.extend(get_date(url, dic_h, dic_cook))
            print('collected %d records so far' % len(datalst))
        except:
            errurl.append(url)
            print('failed to scrape:', url)
            print('\n')
    df = pd.DataFrame(datalst)
    print(df.iloc[1])
    df.to_csv('web_loan.csv')

The second part of the code is simply a reuse of the part above; in passing, it scrapes the information for the platforms currently in operation, and the success rate was in line with expectations. Processing the CSV data is a job for pandas and is not planned as part of this exercise. If this helps you, I'd be honored.
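As noted, a small share of the URLs failed on the first pass. One simple follow-up, sketched here rather than taken from the original post, is a second pass over errurl with a longer pause before exporting; it assumes the names get_date, dic_h, dic_cook, datalst, and errurl from the script above.

# Sketch: retry the URLs that failed on the first pass.
import random, time

still_failed = []
for url in errurl:
    time.sleep(random.randint(3, 6))       # wait a bit longer than on the first pass
    try:
        datalst.extend(get_date(url, dic_h, dic_cook))
    except Exception:
        still_failed.append(url)           # give up on these after the second attempt

print('URLs still failing after the retry:', len(still_failed))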
Finally, as usual, screenshots of the CSV output from both parts are attached (no screenshot, no proof).