简单的通过异步来爬取必应搜索结果,速度非常可观。 通过用 aiohttp, asyncio这两个异步模块,再通过xpath来提取链接。 加个cookie可以防止爬虫被禁
import aiohttp import asyncio from lxml import etree headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/81.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', 'Accept-Language': 'zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2', 'Accept-Encoding': 'gzip, deflate', 'Cookie': 'BAIDUID=1A6EF88EE4929836C761FB37A1303522:FG=1; BIDUPSID=1A6EF88EE4929836C761FB37A1303522; PSTM=1603199415; H_PS_PSSID=32755_1459_32877_7567_31253_32706_32231_7517_32117_32845_32761_26350; BD_UPN=13314752; BDORZ=B490B5EBF6F3CD402E515D22BCDA1598; delPer=0; BD_CK_SAM=1; PSINO=5; H_PS_645EC=e4bcE4275G3zWcvH2pxYG6R32rBxb5yuey8xcioaej8V7IaJRfEq4xp4iCo; COOKIE_SESSION=45294_0_2_5_0_2_0_1_0_2_3_0_0_0_0_0_0_0_1603244844%7C5%230_0_1603244844%7C1; BA_HECTOR=2gal2h2ga58025f1vs1fov5vf0k'} async def url(): async with aiohttp.ClientSession() as session: for i in range(1,100):#通过for in来翻页 url = 'https://cn.bing.com/search?q=site%3aedu.cn&go=%e6%90%9c%e7%b4%a2&qs=ds&first='+ str((i*10)-1) +'&FORM=PERE' try: async with session.get(url,headers = headers) as resp: r = await resp.text() a = etree.HTML(r) xpath = a.xpath('//*[@id="b_results"]/li/h2/a/@href')#提取url连接 for i in xpath: print(i) except: print('无法连接') asyncio.run(url())