目录
一、实现思路
二、获取url变化规律
三、爬取新闻名称及其超链接
四、判断与主题的契合度
四、输出结果
五、总代码
一、实现思路
本次爬取搜狐新闻时政类
获取url——爬取新闻名称及其超链接——判断与主题契合度——得到最终结果
二、获取url变化规律
观察发现,搜狐新闻页面属于动态页面,但是在 F12——Network——XHR 下并没有文件,所以不能从这里找。从 All 中发现有一个文件包含想要找的内容,并且该文件属于 js 文件。观察四个以 feed 开头的文件的 url 规律:page 逐页变化;callback 的变化无规律;最后的数字每页 +8。将 callback 去掉后发现对返回的内容无影响,所以最终获取各页 url 的代码采用字符串拼接的形式:
# Build the feed URL for pages 1 through 9. The trailing ``_`` query value
# starts from a fixed base number and advances by 8 for every page.
for p in range(1, 10):
    p2 = 1603263206992 + p * 8
    url = (
        'https://v2.sohu.com/public-api/feed?scene=CATEGORY&sceneId=1460'
        f'&page={p}&size=20&_={p2}'
    )
三、爬取新闻名称及其超链接
本次用正则表达式获取
实现代码:
# Request headers sent with every feed request.
# NOTE(review): the cookie looks like a captured browser session (it carries
# login-style tokens); it will presumably expire and should not be kept in
# shared source -- confirm whether the endpoint needs it at all.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
    'cookie': 'itssohu=true; BAIDU_SSP_lcr=https://news.hao123.com/wangzhi; IPLOC=CN3300; SUV=201021142102FD7T; reqtype=pc; gidinf=x099980109ee124d51195e802000a3aab2e8ca7bf7da; t=1603261548713; jv=78160d8250d5ed3e3248758eeacbc62e-kuzhE2gk1603261903982; ppinf=2|1603261904|1604471504|bG9naW5pZDowOnx1c2VyaWQ6Mjg6MTMxODgwMjEyODc2ODQzODI3MkBzb2h1LmNvbXxzZXJ2aWNldXNlOjMwOjAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMHxjcnQ6MTA6MjAyMC0xMC0yMXxlbXQ6MTowfGFwcGlkOjY6MTE2MDA1fHRydXN0OjE6MXxwYXJ0bmVyaWQ6MTowfHJlbGF0aW9uOjA6fHV1aWQ6MTY6czExZjVhZTI2NTJiNmM3Nnx1aWQ6MTY6czExZjVhZTI2NTJiNmM3Nnx1bmlxbmFtZTowOnw; pprdig=L2Psu-NwDR2a1BZITLwhlxdvI2OrHzl6jqQlF3zP4z70gqsyYxXmf5dCZGuhPFZ-XWWE5mflwnCHURGUQaB5cxxf8HKpzVIbqTJJ3_TNhPgpDMMQdFo64Cqoay43UxanOZJc4-9dcAE6GU3PIufRjmHw_LApBXLN7sOMUodmfYE; ppmdig=1603261913000000cfdc2813caf37424544d67b1ffee4770',
}

# A timeout keeps one stalled connection from hanging the whole crawl.
res = requests.get(url, headers=headers, timeout=10)

# Match against the raw response text. Passing it through BeautifulSoup and
# back to a string HTML-escapes it, which turns every '&' inside an extracted
# link into '&amp;'.
body = res.text
news = re.findall(r'"mobileTitle":"(.*?)",', body)
herf = re.findall(r'"originalSource":"(.*?)"', body)

# Titles and links are paired purely by position, so this relies on every
# record carrying both fields.
news_dic = dict(zip(news, herf))

# Merge this page's results into the running title -> link dictionary.
news_dictall.update(news_dic)
四、判断与主题的契合度
def ifsim(topicwords):
    """Keep the news items whose title shares a keyword with ``topicwords``.

    Args:
        topicwords: set of topic keywords; a title is kept when at least one
            of its extracted keywords is in this set.

    Returns:
        dict mapping each matching title to its link. The same dict is also
        printed.
    """
    news_dicfin = {}
    news_dic = getdata()
    # Raw string: the path is full of backslashes, and sequences such as
    # ``\p``, ``\d`` and ``\s`` are invalid escapes in a normal literal.
    # The resulting path is character-for-character the same.
    ana.set_stop_words(r'D:\作业\python\文本挖掘\数据集\新闻数据集\data\stopwords.txt')
    for k, v in news_dic.items():
        # Up to 50 keywords extracted from the title, without weights.
        word_list = ana.extract_tags(k, topK=50, withWeight=False)
        # Wrap every keyword in its own list, the nested-list input that
        # Dictionary is given here.
        word_lil = [[i] for i in word_list]
        word_dic = Dictionary(word_lil)
        # Copy the dictionary's (key, value) pairs into a plain dict; its
        # values are the keywords.
        d = dict(word_dic.items())
        docwords = set(d.values())
        commwords = topicwords.intersection(docwords)
        # A non-empty intersection means the title is on topic.
        if commwords:
            news_dicfin[k] = v
    print(news_dicfin)
    return news_dicfin
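各中间变量直接输出的结果说明如下(以单个标题为例):
若直接输出 word_dic,结果为由 word_lil 构建的 gensim Dictionary 对象;docwords 输出结果为:d 中全部关键词组成的集合。
word_list 输出结果:由 extract_tags 从标题中提取出的关键词列表;word_lil 输出结果为:把每个关键词单独包成一个列表后得到的嵌套列表;d 的输出结果为:由 word_dic.items() 转成的普通字典,其中的值为关键词。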
四、输出结果
本次通过判断标题关键词与我给定的主题词相同的个数来筛选:只要交集 >0,即判定该新闻属于该主题,并将其存入最终字典。news_dicfin 的输出结果为:
五、总代码
import json
import re

import jieba
import jieba.analyse as ana
import requests
from bs4 import BeautifulSoup
from gensim.corpora.dictionary import Dictionary
# Feed endpoint; ``page`` selects the page and ``stamp`` fills the trailing
# ``_`` query value.
_FEED_URL = (
    'https://v2.sohu.com/public-api/feed?scene=CATEGORY&sceneId=1460'
    '&page={page}&size=20&_={stamp}'
)

# Request headers, built once instead of on every loop iteration.
# NOTE(review): the cookie looks like a captured browser session (it carries
# login-style tokens); it will presumably expire and should not be kept in
# shared source -- confirm whether the endpoint needs it at all.
_HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/86.0.4240.75 Safari/537.36',
    'cookie': 'itssohu=true; BAIDU_SSP_lcr=https://news.hao123.com/wangzhi; IPLOC=CN3300; SUV=201021142102FD7T; reqtype=pc; gidinf=x099980109ee124d51195e802000a3aab2e8ca7bf7da; t=1603261548713; jv=78160d8250d5ed3e3248758eeacbc62e-kuzhE2gk1603261903982; ppinf=2|1603261904|1604471504|bG9naW5pZDowOnx1c2VyaWQ6Mjg6MTMxODgwMjEyODc2ODQzODI3MkBzb2h1LmNvbXxzZXJ2aWNldXNlOjMwOjAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMDAwMHxjcnQ6MTA6MjAyMC0xMC0yMXxlbXQ6MTowfGFwcGlkOjY6MTE2MDA1fHRydXN0OjE6MXxwYXJ0bmVyaWQ6MTowfHJlbGF0aW9uOjA6fHV1aWQ6MTY6czExZjVhZTI2NTJiNmM3Nnx1aWQ6MTY6czExZjVhZTI2NTJiNmM3Nnx1bmlxbmFtZTowOnw; pprdig=L2Psu-NwDR2a1BZITLwhlxdvI2OrHzl6jqQlF3zP4z70gqsyYxXmf5dCZGuhPFZ-XWWE5mflwnCHURGUQaB5cxxf8HKpzVIbqTJJ3_TNhPgpDMMQdFo64Cqoay43UxanOZJc4-9dcAE6GU3PIufRjmHw_LApBXLN7sOMUodmfYE; ppmdig=1603261913000000cfdc2813caf37424544d67b1ffee4770',
}

# Text patterns used when the body cannot be read record by record.
_TITLE_RE = re.compile(r'"mobileTitle":"(.*?)",')
_LINK_RE = re.compile(r'"originalSource":"(.*?)"')


def _parse_feed(text):
    """Extract a ``{title: link}`` dict from one feed response body."""
    try:
        payload = json.loads(text)
    except ValueError:
        payload = None

    pairs = {}
    if isinstance(payload, list):
        # Read title and link from the SAME record so that a record missing
        # one field cannot shift the pairing of every record after it.
        for item in payload:
            if not isinstance(item, dict):
                continue
            title = item.get('mobileTitle')
            if isinstance(title, str):
                link = item.get('originalSource')
                pairs[title] = link if isinstance(link, str) else ''
    if pairs:
        return pairs

    # Fallback for bodies that are not a JSON array of records: scan the raw
    # text and pair matches by position. The raw text is used on purpose --
    # round-tripping it through an HTML parser escapes '&' inside links.
    return dict(zip(_TITLE_RE.findall(text), _LINK_RE.findall(text)))


def getdata():
    """Download feed pages 1-9 and collect news titles with their links.

    Returns:
        dict mapping each title (``mobileTitle``) to its link
        (``originalSource``). When the same title occurs on several pages,
        the link from the last page wins. A page whose request fails is
        reported on stdout and skipped.
    """
    news_dictall = {}
    for p in range(1, 10):
        # The trailing ``_`` value advances by 8 for every page.
        url = _FEED_URL.format(page=p, stamp=1603263206992 + p * 8)
        try:
            # The timeout keeps one stalled connection from hanging the crawl.
            res = requests.get(url, headers=_HEADERS, timeout=10)
            res.raise_for_status()
        except requests.RequestException as exc:
            print('getdata: skipping page {}: {}'.format(p, exc))
            continue
        news_dictall.update(_parse_feed(res.text))
    return news_dictall
def ifsim(
    topicwords,
    stopwords_path=r'D:\作业\python\文本挖掘\数据集\新闻数据集\data\stopwords.txt',
):
    """Keep the news items whose title shares a keyword with ``topicwords``.

    Args:
        topicwords: iterable of topic keywords (the original code required a
            ``set``; any iterable is accepted now).
        stopwords_path: stop-word file handed to ``ana.set_stop_words``.
            Defaults to the path that used to be hard-coded. It is written as
            a raw string because sequences such as ``\\p``, ``\\d`` and
            ``\\s`` are invalid escapes in a normal literal; the resulting
            path is unchanged.

    Returns:
        dict mapping each matching title to its link. The same dict is also
        printed, as before.
    """
    topic_set = set(topicwords)
    # Configure the stop words before crawling, so a bad path is noticed
    # before any network request is made.
    ana.set_stop_words(stopwords_path)
    news_dic = getdata()

    news_dicfin = {}
    for title, link in news_dic.items():
        # Up to 50 keywords extracted from the title, without weights.
        keywords = ana.extract_tags(title, topK=50, withWeight=False)
        # The earlier version fed the keywords through a gensim Dictionary
        # and read them back out, which yields the same set of words, so the
        # overlap test is done on the keyword list directly.
        if not topic_set.isdisjoint(keywords):
            news_dicfin[title] = link
    print(news_dicfin)
    return news_dicfin
def _main():
    """Run the topic filter with the epidemic-related keyword set."""
    topicwords = {"疫情", "新冠", "肺炎", "确诊", "病例"}
    ifsim(topicwords)


if __name__ == '__main__':
    _main()