Preface
These are my notes from learning about web crawlers. The post mainly uses XPath to parse the HTML of a page, performs a keyword search under a first-level link, and scrapes the URL, title, source, publish time, and body text of every article matching the keyword.
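To make the XPath step concrete before the full scripts, here is a minimal, self-contained sketch of pulling attribute values out of an HTML fragment with lxml. The fragment and the lanmu1 attribute are made up to mirror the CCTV search markup used below:

from lxml import html

# A made-up fragment shaped like a CCTV search-result entry (illustrative only)
snippet = '''
<ul>
  <li><h3 class="tit"><span lanmu1="https://news.cctv.com/a.shtml">A</span></h3></li>
  <li><h3 class="tit"><span lanmu1="https://news.cctv.com/b.shtml">B</span></h3></li>
</ul>
'''

tree = html.fromstring(snippet)
# An @attr step at the end of an XPath expression returns the attribute
# values themselves, not the elements that carry them
urls = tree.xpath('//ul//h3[@class="tit"]/span/@lanmu1')
print(urls)  # ['https://news.cctv.com/a.shtml', 'https://news.cctv.com/b.shtml']

This @attr trick is exactly how the first script below collects the result URLs.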
The full code is as follows:
import requests, re, csv, sys, time
from lxml import html
from fake_useragent import UserAgent
import tkinter as tk
from tkinter import filedialog

# Hide the tkinter root window, then ask for a folder and a file to read
root = tk.Tk()
root.withdraw()
folder_path = filedialog.askdirectory()
file_path = filedialog.askopenfilename()
print(file_path)

'''Select the path where the file is saved;
the selected file supplies the search keyword.
'''
with open(file_path) as file_name:
    reader = csv.reader(file_name)
    result = list(reader)

url_path = list(result[0:3])
print(url_path)
url_word = url_path[0][:]   # copy the first row
del url_word[0]             # drop the leading cell, keeping the keyword

# Record the start time
startTime = time.time()

# Create the CSV file and write the header row
fp = open(r'E:\The_data_for_topiclcrlaw\my_url\my_url.csv', 'a',
          newline='', encoding='utf-8-sig')
writer = csv.writer(fp)
writer.writerow(["URL"])    # a one-column header, not one column per letter


def main():
    qtext = url_word[0]
    for i in range(1, 5):
        url = ('https://search.cctv.com/search.php?qtext={}&sort=relevance'
               '&type=web&vtime=&datepid=1&channel=&page={}').format(qtext, i)
        try:
            headers = {
                "User-Agent": UserAgent().chrome  # random Chrome user agent
            }
            response = requests.get(url=url, headers=headers)
            response.encoding = 'utf-8'
            text_html = response.text
            # The result URLs sit inside HTML comments; turning the comment
            # markers into quotes makes them ordinary attribute values
            text_html = text_html.replace(r'<!--', '"').replace(r'-->', '"')
            text_html = html.fromstring(text_html)
            text_list = text_html.xpath('//ul//h3[@class="tit"]/span/@lanmu1')
            writer.writerow(text_list)
            print(text_list)
            print(len(text_list))
        except Exception:
            pass


if __name__ == '__main__':
    main()
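A note on the two replace() calls in the script above: judging from what the code does, the CCTV search page appears to wrap each result URL in HTML comment markers, and lxml discards comments, so rewriting <!-- and --> as quotation marks turns the hidden values into ordinary quoted attributes that XPath can read. A minimal sketch of the effect, on a made-up snippet (the markup pattern is my assumption):

from lxml import html

# Assumed shape of the raw markup: the URL hidden between comment markers
raw = '<ul><h3 class="tit"><span lanmu1=<!--https://news.cctv.com/x.shtml-->>X</span></h3></ul>'

# Without the rewrite, lxml treats <!-- ... --> as a comment and drops the value
fixed = raw.replace('<!--', '"').replace('-->', '"')
tree = html.fromstring(fixed)
print(tree.xpath('//h3[@class="tit"]/span/@lanmu1'))
# ['https://news.cctv.com/x.shtml']

The first script only harvests these result URLs into my_url.csv; the second script below reads that file back and scrapes the title, source, publish time, and body text from each article page.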
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: lai zheng laing
# datetime: 2020/10/17 9:27
# software: PyCharm
import requests, re, csv, sys, time
from lxml import html
from fake_useragent import UserAgent

# Record the start time
startTime = time.time()

# Create the CSV file and write the header row (title, source page, time, URL, body text)
fp = open(r'E:\The_data_for_topiclcrlaw\cctv_te_langpu_data\关于特定关键词的检索.csv',
          'a', newline='', encoding='utf-8-sig')
writer = csv.writer(fp)
writer.writerow(("标题", "起始网页", "时间", "URL", "正文内容"))


# ----------------------- scraping function -----------------------
def spider_html_info(url):
    try:
        headers = {
            "User-Agent": UserAgent().chrome  # random Chrome user agent
        }
        response = requests.get(url=url, headers=headers)
        response.encoding = 'utf-8'
        text_html = html.fromstring(response.text)
        print(text_html)

        # Initialise the fields so the final writerow() never hits an undefined name
        title = publish_time = source_text = article_text = ""

        # Get the next-page link before the other elements, to keep the program robust
        # next_url = "http://news.cctv.com" + text_html.xpath('/html/body/div[2]/div[1]/ul[1]/li[2]/a[2]/@href')[0]
        # print("next_url", next_url)

        # Get the article title
        try:
            article_title = text_html.xpath('//*[@id="title_area"]/h1//text()')
            title = "".join(article_title)
            if title == " ":
                pass
            # title = "".join(text_html.xpath('
            print("title = ", title)
        except:
            pass

        # Get the publish time
        try:
            publish_time = text_html.xpath('//*[@id="title_area"]/div//text()')
            print("publish_time = ", publish_time)
        except:
            pass

        try:
            print("url = ", url)
        except:
            pass

        # Get the source of this news item
        try:
            source_text = text_html.xpath('//*[@id="title_area"]/div/a/@href')
            source = source_text[3:]
        except:
            pass

        # Scrape the body text
        try:
            text_list = text_html.xpath('//*[@id="content_area"]//text()')
            article_text = "".join(text_list)
            # print(text_list)
            # article_text = "".join(text_list).replace('\r\n', '').replace("\xa0", "") \
            #     .replace("\t", "").replace(source_text, "").replace(title, "")
            print("article_text = ", article_text)
        except:
            pass

        writer.writerow((title, source_text, publish_time, url, article_text))
    except:
        pass

    # if url == 'http://www.chinanpo.gov.cn/1944/123496/index.html':
    #     fp.close()
    #     # Record the finishing time
    #     endTime = time.time()
    #     useTime = (endTime - startTime) / 60
    #     print("This run took %s minutes in total" % useTime)
    #     # Exit the program normally
    #     sys.exit(0)
    # else:
    #     return next_url


# ------------------- main function -------------------------
def main():
    # url = 'https://news.cctv.com/2020/10/17/ARTIp0AnISoJeLZW79bkffYW201017.shtml'  # first article, 125177
    file_path = 'E:/The_data_for_topiclcrlaw/my_url/my_url.csv'
    # url = spider_html_info(url)
    with open(file_path) as file_name:
        reader = csv.reader(file_name)
        result = list(reader)
    del result[0]               # drop the "URL" header row
    a = len(result[:][:])       # number of rows
    b = len(result[0][:])       # number of columns
    for i in range(a):
        for j in range(b):
            url = spider_html_info(result[i][j])
    # for url in my_url_list:
    #     url = spider_html_info(url)
    # while True:
    #     print("Scraping article %s:" % count, url)
    #     next_url = spider_html_info(url)
    #     url = next_url
    #     count = count + 1


if __name__ == '__main__':
    main()
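One design note on spider_html_info(): the bare try/except blocks keep the crawler alive across malformed pages, but they silently swallow every error. A small helper along these lines (hypothetical, not part of the original script) keeps the same resilience while making the fallback explicit:

def xpath_join(tree, path, default=''):
    """Run an XPath query and join the text results, falling back to a default.

    A hypothetical helper, not in the original code: it replaces the repeated
    try/except blocks so a missing node yields '' instead of leaving a name
    undefined by the time writer.writerow() runs.
    """
    try:
        return "".join(tree.xpath(path)) or default
    except Exception:
        return default

# Usage inside spider_html_info (illustrative):
# title = xpath_join(text_html, '//*[@id="title_area"]/h1//text()')
# article_text = xpath_join(text_html, '//*[@id="content_area"]//text()')

With such a helper, each field becomes a single assignment and the CSV row is always complete, even when an article page lacks one of the elements.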