1. Use Selenium to open the 36kr homepage
```python
from selenium import webdriver

chrome_driver = r"C:\Users\yandi\AppData\Local\Programs\Python\Python37-32\chromedriver.exe"
driver = webdriver.Chrome(executable_path=chrome_driver)
driver.get('https://36kr.com/')
```
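Note that `executable_path` was removed in Selenium 4. If you are on a newer release, an equivalent setup passes the driver path through a `Service` object instead; a minimal sketch, assuming the same chromedriver path as above (the rest of this walkthrough keeps the Selenium 3 `find_element_by_*` style):

```python
# Minimal Selenium 4+ variant: executable_path is gone, use Service instead.
from selenium import webdriver
from selenium.webdriver.chrome.service import Service

chrome_driver = r"C:\Users\yandi\AppData\Local\Programs\Python\Python37-32\chromedriver.exe"
driver = webdriver.Chrome(service=Service(chrome_driver))
driver.get('https://36kr.com/')
```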
2. Open the news feed column
```python
# Click into the news feed column
driver.find_element_by_xpath('//*[@id="information"]').click()
```
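Clicking immediately after the page loads can fail if the tab has not rendered yet. A more robust variant uses Selenium's explicit waits to block until the element is clickable; a short sketch (the 10-second timeout is an arbitrary choice):

```python
# Wait up to 10 seconds for the news tab to become clickable, then click it.
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

WebDriverWait(driver, 10).until(
    EC.element_to_be_clickable((By.XPATH, '//*[@id="information"]'))
).click()
```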
3. Scrape the article information
```python
#%% Scrape the type, author, and title of each news item
typelist, authorlist, titlelist = [], [], []
i = 1
while True:
    try:
        # article_type avoids shadowing the built-in type()
        article_type = driver.find_element_by_xpath(
            '//div[@class="information-flow-list"]/div[' + str(i) + ']'
            '//span[@class="kr-flow-bar-motif"]/a').text
        typelist.append(article_type)
        author = driver.find_element_by_xpath(
            '//div[@class="information-flow-list"]/div[' + str(i) + ']'
            '//a[@class="kr-flow-bar-author"]').text
        authorlist.append(author)
        title = driver.find_element_by_xpath(
            '//div[@class="information-flow-list"]/div[' + str(i) + ']'
            '//a[@class="article-item-title weight-bold"]').text
        titlelist.append(title)
        i = i + 1
        if i % 29 == 0:
            print("Refresh #" + str(i // 29) + ", please wait; "
                  + str(i) + " items collected so far")
            # Click the "load more" button to load the next batch of items
            driver.find_element_by_xpath(
                '//*[@id="app"]/div/div[1]/div[3]/div/div/div[1]/div/div/div[3]').click()
    except Exception:
        # Item i is not in the DOM yet: either the next batch is still
        # loading (keep waiting) or we have collected enough and can stop.
        print("waiting")
        if i > 301:
            print("Scraping finished; " + str(i) + " items collected in total")
            break
```
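Locating elements one by one re-queries the DOM three times per article. Once the page has been expanded by the "load more" clicks above, an alternative is to collect everything in three batch queries with `find_elements_by_xpath`; a sketch, assuming the same class names as the per-item XPaths:

```python
# Batch variant: fetch all loaded types, authors, and titles in one pass each.
# Assumes the same class names used in the per-item XPaths above.
typelist = [e.text for e in driver.find_elements_by_xpath(
    '//div[@class="information-flow-list"]//span[@class="kr-flow-bar-motif"]/a')]
authorlist = [e.text for e in driver.find_elements_by_xpath(
    '//div[@class="information-flow-list"]//a[@class="kr-flow-bar-author"]')]
titlelist = [e.text for e in driver.find_elements_by_xpath(
    '//div[@class="information-flow-list"]//a[@class="article-item-title weight-bold"]')]
```

One caveat: the three lists only stay aligned if every item carries all three fields; if some articles lack an author, the per-item loop above is the safer choice.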
4. Merge the data
```python
#%% Merge the three lists into one DataFrame
import pandas as pd

frame_title = pd.DataFrame(titlelist, columns=['title'])
frame_type = pd.DataFrame(typelist, columns=['type'])
frame_author = pd.DataFrame(authorlist, columns=['author'])
info_frame = frame_type.join(frame_title).join(frame_author)
```
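The three single-column frames can also be assembled in one call. An equivalent sketch builds the DataFrame directly from a dict of the three lists (this requires the lists to be of equal length):

```python
# Equivalent one-step construction from the three lists.
import pandas as pd

info_frame = pd.DataFrame({'type': typelist, 'title': titlelist, 'author': authorlist})
```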
5. Serialize and write to disk
```python
#%% Pickle the DataFrame to disk
import pickle

with open(r"C:\Users\yandi\PycharmProjects\MachineLearing\36氪\info_frame.pkl", "wb") as f:
    pickle.dump(info_frame, f)
```
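To verify the dump, the pickle can be read straight back with pandas' built-in reader; a quick check, using the same path written above:

```python
# Read the pickled DataFrame back and inspect the first rows.
import pandas as pd

restored = pd.read_pickle(r"C:\Users\yandi\PycharmProjects\MachineLearing\36氪\info_frame.pkl")
print(restored.head())
```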