import re
import urllib.parse
import urllib.request

import xlwt
from bs4 import BeautifulSoup
def main():
    """Scrape the Douban Top 250 listing and save the links to a spreadsheet.

    The links are gathered by ``get_data`` and written out by ``saveData``.
    """
    base_url = 'https://movie.douban.com/top250?start='
    output_path = './/豆瓣电影TOP250链接.xls'

    links = get_data(base_url)
    saveData(links, output_path)
# Captures the href of an anchor tag whose only attribute is ``href``
# (the pattern requires ``">`` immediately after the attribute value).
findLink = re.compile(r'<a href="(.*?)">')
def get_data(url, pages=10, page_size=25):
    """Collect the first link of every movie entry across the listing pages.

    Each page is fetched with ``askurl`` from ``url + str(offset)``, where the
    offset advances by *page_size* per page. Every ``<div class="item">`` on a
    page is searched with ``findLink`` and its first match is kept.

    Args:
        url: Base listing URL ending in the ``start=`` query parameter; the
            numeric offset is appended to it.
        pages: Number of listing pages to fetch (default 10).
        page_size: Offset increment between consecutive pages (default 25).

    Returns:
        A list of one-element lists, ``[[link], [link], ...]``, in page order.
    """
    datalist = []
    for page in range(pages):
        html = askurl(url + str(page * page_size))
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', class_='item'):
            match = findLink.search(str(item))
            if match is None:
                # An entry without a matching anchor used to raise IndexError
                # and abort the whole crawl; skip just that entry instead.
                continue
            datalist.append([match.group(1)])
    return datalist
def saveData(datalist, savepath):
    """Write the collected links into an ``.xls`` workbook at *savepath*.

    Row 0 holds the column header; each following row holds the first element
    of one entry of *datalist*. A progress line is printed for every row.

    Args:
        datalist: Sequence of rows, each an indexable whose element ``0`` is
            the link to store.
        savepath: Destination path passed to ``Workbook.save``.
    """
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('豆瓣电影')
    sheet.write(0, 0, '电影链接')

    # Iterate over what was actually scraped rather than a fixed 250 rows, so
    # a short result set no longer raises IndexError before the file is saved.
    for row, data in enumerate(datalist, start=1):
        print('第%d条' % row)
        sheet.write(row, 0, data[0])

    book.save(savepath)
    
def askurl(baseurl):
    """Fetch *baseurl* and return the response body decoded as UTF-8.

    The request carries a desktop-browser ``User-Agent`` header.

    Args:
        baseurl: URL to request.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: Propagated from ``urlopen`` when the request
            fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36"
    }
    request = urllib.request.Request(baseurl, headers=headers)
    # The context manager closes the connection even if read/decode fails;
    # the original left the response open.
    with urllib.request.urlopen(request) as response:
        return response.read().decode('utf-8')
# Run the scraper only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
 
# www.dan
                
                
                
        
    
 
# 转载请注明原文地址: https://lol.8miu.com/read-31005.html