import urllib
.parse
import urllib
.request
import re
from bs4
import BeautifulSoup
import xlwt
def main():
    """Fetch the Douban Top 250 movie links and store them in an .xls file."""
    # Listing URL without its offset value; get_data appends the offset per page.
    listing_url = 'https://movie.douban.com/top250?start='
    output_file = './/豆瓣电影TOP250链接.xls'
    saveData(get_data(listing_url), output_file)
# Matches an opening tag that starts with `<a href="` and captures everything
# (non-greedy, same line) up to the next `">`; get_data uses it to extract the
# link from each movie entry.
findLink = re.compile(r'<a href="(.*?)">')
def get_data(url, pages=10, page_size=25):
    """Collect the first link of every movie entry across the listing pages.

    Args:
        url: Listing URL ending with the offset parameter; the numeric offset
            of each page is appended to it.
        pages: Number of listing pages to fetch (default 10).
        page_size: Entries per page, used as the offset step (default 25).

    Returns:
        A list holding one single-element list ``[link]`` per entry, in the
        order the entries were encountered.
    """
    datalist = []
    for page in range(pages):
        html = askurl(url + str(page * page_size))
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', class_='item'):
            # The pattern is matched against the entry's raw markup.
            match = findLink.search(str(item))
            if match is None:
                # An entry without a matching anchor is skipped; indexing the
                # empty result used to abort the whole crawl with IndexError.
                continue
            datalist.append([match.group(1)])
    return datalist
def saveData(datalist, savepath):
    """Write the collected links into the first column of a new .xls workbook.

    Row 0 holds the column header; each following row holds the first element
    of the corresponding entry in ``datalist``. Progress is printed per row.

    Args:
        datalist: Sequence of rows; only ``row[0]`` of each is written.
        savepath: Destination path passed to ``Workbook.save``.
    """
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('豆瓣电影')
    sheet.write(0, 0, '电影链接')  # header cell
    # Iterate over the rows actually collected: a fixed count of 250 raised
    # IndexError (before anything was saved) whenever fewer were available.
    for row, data in enumerate(datalist, start=1):
        print('第%d条' % row)
        sheet.write(row, 0, data[0])
    book.save(savepath)
def askurl(baseurl):
    """Fetch ``baseurl`` and return the response body decoded as UTF-8.

    Args:
        baseurl: URL to request.

    Returns:
        The full response body as ``str``.

    Errors raised by ``urlopen`` or by decoding are not caught here and
    propagate to the caller.
    """
    # Send a browser-like User-Agent instead of urllib's default one
    # (presumably the site rejects the default agent -- confirm).
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
    }
    request = urllib.request.Request(baseurl, headers=headers)
    # The context manager closes the response (and its connection) even if
    # reading or decoding fails; previously it was never closed.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
# Run the scraper only when this file is executed directly as a script.
if __name__ == '__main__':
    main()
# Reposted from https://lol.8miu.com/read-31005.html -- please credit the original source when reposting.