Implementing a Topic Crawler: Crawling News


Preface

This post records what I learned about topic crawlers. The main idea is to use XPath to parse the HTML of a page, run a keyword search under a first-level link, and then crawl the URL, title, source, publish time, and body text of every article that matches the keyword.
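Before the full scripts, here is a minimal sketch of the core extraction step. It is only a simplified illustration under the same assumptions as the code below (the CCTV search-page layout and the `@lanmu1` attribute come from the original scripts; the keyword and the fixed User-Agent string are placeholders of mine):

```python
# Minimal sketch: fetch one page of CCTV search results and pull article links
# with a single XPath expression. The keyword is a placeholder; the real scripts
# read it from a CSV file chosen by the user.
import requests
from lxml import html

keyword = "placeholder"  # hypothetical search term
url = ("https://search.cctv.com/search.php?"
       "qtext={}&sort=relevance&type=web&vtime=&datepid=1&channel=&page=1").format(keyword)

resp = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
resp.encoding = "utf-8"

# The result markup is wrapped in HTML comments, so strip the comment markers
# before parsing, then select the link attribute with XPath.
page = html.fromstring(resp.text.replace("<!--", '"').replace("-->", '"'))
links = page.xpath('//ul//h3[@class="tit"]/span/@lanmu1')
print(links)
```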

The full code is as follows:

Script 1 collects the article URLs: it lets you pick the keyword CSV with a tkinter file dialog, queries the CCTV search page by page, and writes the matched URLs to my_url.csv.

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: lai zheng laing
# datetime: 2020/10/19 9:45
# software: PyCharm
import requests, re, csv, sys, time
from lxml import html
from fake_useragent import UserAgent
import tkinter as tk
from tkinter import filedialog

# Choose the CSV file that stores the search keyword via a file dialog
root = tk.Tk()
root.withdraw()
folder_path = filedialog.askdirectory()   # folder chosen by the user
file_path = filedialog.askopenfilename()  # CSV file that stores the keyword
print(file_path)

# Read the keyword list from the selected file
with open(file_path) as file_name:
    reader = csv.reader(file_name)
    result = list(reader)
    url_path = list(result[0:3])
    print(url_path)

url_word = url_path[0][:]
del url_word[0]

# Record the start time
startTime = time.time()

# Create the CSV file and write the header row
fp = open(r'E:\The_data_for_topiclcrlaw\my_url\my_url.csv', 'a', newline='', encoding='utf-8-sig')
writer = csv.writer(fp)
writer.writerow(["URL"])  # wrap in a list so "URL" is written as one cell


# ------------------- main function -------------------------
def main():
    qtext = url_word[0]
    for i in range(1, 5):
        url = ('https://search.cctv.com/search.php?qtext={}&sort=relevance'
               '&type=web&vtime=&datepid=1&channel=&page={}').format(qtext, i)
        try:
            headers = {
                "User-Agent": UserAgent().chrome  # random Chrome user agent
            }
            response = requests.get(url=url, headers=headers)
            response.encoding = 'utf-8'  # avoid garbled HTML
            text_html = response.text
            # The search results are wrapped in HTML comments; strip the markers
            # so lxml can see the content
            text_html = text_html.replace(r'<!--', '"').replace(r'-->', '"')
            text_html = html.fromstring(text_html)
            text_list = text_html.xpath('//ul//h3[@class="tit"]/span/@lanmu1')
            writer.writerow(text_list)
            print(text_list)
            print(len(text_list))
        except Exception:
            pass


if __name__ == '__main__':
    main()
```

Script 2 reads my_url.csv and, for each URL, extracts the title, publish time, source, and body text with XPath, writing everything to a second CSV.

```python
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# author: lai zheng laing
# datetime: 2020/10/17 9:27
# software: PyCharm
import requests, re, csv, sys, time
from lxml import html
from fake_useragent import UserAgent

# Record the start time
startTime = time.time()

# Create the CSV file and write the header row
fp = open(r'E:\The_data_for_topiclcrlaw\cctv_te_langpu_data\关于特定关键词的检索.csv',
          'a', newline='', encoding='utf-8-sig')
writer = csv.writer(fp)
writer.writerow(("标题", "起始网页", "时间", "URL", "正文内容"))


# ----------------------- scraping function -----------------------
def spider_html_info(url):
    try:
        headers = {
            "User-Agent": UserAgent().chrome  # random Chrome user agent
        }
        response = requests.get(url=url, headers=headers)
        response.encoding = 'utf-8'
        text_html = html.fromstring(response.text)

        # Article title
        try:
            article_title = text_html.xpath('//*[@id="title_area"]/h1//text()')
            title = "".join(article_title)
            print("title = ", title)
        except Exception:
            pass

        # Publish time
        try:
            publish_time = text_html.xpath('//*[@id="title_area"]/div//text()')
            print("publish_time = ", publish_time)
        except Exception:
            pass

        print("url = ", url)

        # Source of the news item
        try:
            source_text = text_html.xpath('//*[@id="title_area"]/div/a/@href')
            source = source_text[3:]
        except Exception:
            pass

        # Body text
        try:
            text_list = text_html.xpath('//*[@id="content_area"]//text()')
            article_text = "".join(text_list)
            print("article_text = ", article_text)
        except Exception:
            pass

        writer.writerow((title, source_text, publish_time, url, article_text))
    except Exception:
        pass


# ------------------- main function -------------------------
def main():
    file_path = 'E:/The_data_for_topiclcrlaw/my_url/my_url.csv'
    with open(file_path) as file_name:
        reader = csv.reader(file_name)
        result = list(reader)
    del result[0]           # drop the header row
    a = len(result)         # number of rows
    b = len(result[0])      # number of URLs per row
    for i in range(a):
        for j in range(b):
            spider_html_info(result[i][j])


if __name__ == '__main__':
    main()
```
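One fragile spot worth noting: in `spider_html_info`, the fields written at the end are only defined if their individual `try` blocks succeeded, so a page missing one element can raise a `NameError` that the outer `except` silently swallows, and the whole row is lost. Below is a small, hedged variation of mine (the function name and parameters are not from the original post; the XPath expressions are the same ones used above) that gives every field a default first:

```python
import requests
from lxml import html


def spider_html_info_safe(url, writer, headers):
    """Sketch of a more defensive version: every field gets a default value,
    so one missing element does not drop the whole CSV row."""
    title = publish_time = source = article_text = ""
    try:
        response = requests.get(url=url, headers=headers, timeout=10)
        response.encoding = "utf-8"
        page = html.fromstring(response.text)
        title = "".join(page.xpath('//*[@id="title_area"]/h1//text()'))
        publish_time = "".join(page.xpath('//*[@id="title_area"]/div//text()'))
        source = "".join(page.xpath('//*[@id="title_area"]/div/a/@href'))
        article_text = "".join(page.xpath('//*[@id="content_area"]//text()'))
    except requests.RequestException as exc:
        # Log the failure instead of hiding it, then still write a partial row
        print("request failed:", url, exc)
    writer.writerow((title, source, publish_time, url, article_text))
```

To run the pipeline, execute Script 1 first to build my_url.csv, then Script 2 to fill the detail CSV from those URLs.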