spider.简洁版(豆瓣电影TOP250链接)

it2025-10-31  1

import re
import urllib.parse
import urllib.request

import xlwt
from bs4 import BeautifulSoup

# Captures the href value of the first '<a href="...">' tag in an item's HTML.
findLink = re.compile(r'<a href="(.*?)">')


# Main entry point
def main():
    """Scrape the Douban Top 250 movie links and save them to an .xls file."""
    url = 'https://movie.douban.com/top250?start='
    datalist = get_data(url)
    savepath = './/豆瓣电影TOP250链接.xls'
    saveData(datalist, savepath)


# Data extraction
def get_data(url, pages=10, page_size=25):
    """Collect one link per movie entry from consecutive listing pages.

    Args:
        url: Base listing URL; the start offset is appended to it.
        pages: Number of listing pages to fetch (default 10).
        page_size: Offset step between pages (default 25).

    Returns:
        A list of single-element lists, each holding one link string.
    """
    datalist = []
    for page in range(pages):
        html = askurl(url + str(page * page_size))
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', class_='item'):
            match = findLink.search(str(item))
            if match is None:
                # No anchor in this entry: skip it instead of raising
                # IndexError and losing everything scraped so far.
                continue
            datalist.append([match.group(1)])
    return datalist


def saveData(datalist, savepath):
    """Write the links to column 0 of an .xls sheet, below a header row.

    Args:
        datalist: Rows as returned by get_data(); only element 0 of each
            row is written.
        savepath: Destination path of the workbook.
    """
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('豆瓣电影')
    sheet.write(0, 0, '电影链接')
    # Iterate over the rows actually present rather than a fixed 250, so a
    # short scrape does not fail with IndexError before the file is saved.
    for i, data in enumerate(datalist):
        print('第%d条' % (i + 1))
        sheet.write(i + 1, 0, data[0])
    book.save(savepath)


def askurl(baseurl):
    """Fetch baseurl with a browser User-Agent and return the body as text.

    Args:
        baseurl: Full URL of the page to download.

    Returns:
        The response body decoded as UTF-8.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
    }
    request = urllib.request.Request(baseurl, headers=headers)
    # The context manager closes the HTTP response even if read/decode fails.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')


if __name__ == '__main__':
    main()

www.dan

最新回复(0)