Crawling the Zongheng Chinese Network (zongheng.com) with Scrapy


Contents

Task · Database Design · File Structure · Code Implementation · Download

Task

Use Scrapy to crawl the monthly-ticket ranking on zongheng.com and save each novel's title, author, chapter titles, and chapter contents to a MySQL database. The result looks like the two screenshots below.

novel table (screenshot)

chapter table (screenshot)

Database Design

CREATE DATABASE IF NOT EXISTS `zongheng`;
USE `zongheng`;

/* Table structure for table `novel` */
DROP TABLE IF EXISTS `novel`;
CREATE TABLE `novel` (
  `id` INT(11) NOT NULL AUTO_INCREMENT,
  `novelName` VARCHAR(255) COLLATE utf8_bin NOT NULL UNIQUE,
  `author` VARCHAR(255) COLLATE utf8_bin NOT NULL,
  PRIMARY KEY (`id`)
) ENGINE=INNODB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;

/* Table structure for table `chapter` */
DROP TABLE IF EXISTS `chapter`;
CREATE TABLE `chapter` (
  `id` INT(11) NOT NULL AUTO_INCREMENT,
  `chapterName` VARCHAR(255) COLLATE utf8_bin NOT NULL,
  `content` TEXT COLLATE utf8_bin NULL,
  `novelId` INT(11) NOT NULL,
  PRIMARY KEY (`id`),
  KEY `FK_chapter` (`novelId`),
  CONSTRAINT `FK_chapter` FOREIGN KEY (`novelId`) REFERENCES `novel` (`id`)
) ENGINE=INNODB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
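
After a crawl, the foreign-key link between the two tables can be checked with a quick join. A minimal sketch, reusing the connection parameters from pipelines.py further down:

import pymysql

connection = pymysql.connect(host='localhost', user='root', password='123456',
                             db='zongheng', charset='utf8mb4',
                             cursorclass=pymysql.cursors.DictCursor)
try:
    with connection.cursor() as cursor:
        # Count saved chapters per novel via the novelId foreign key
        cursor.execute(
            "SELECT n.novelName, n.author, COUNT(c.id) AS chapters "
            "FROM novel n LEFT JOIN chapter c ON c.novelId = n.id "
            "GROUP BY n.id, n.novelName, n.author")
        for row in cursor.fetchall():
            print(row['novelName'], row['author'], row['chapters'])
finally:
    connection.close()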

File Structure
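
This is the layout that scrapy startproject zongheng generates, plus the two hand-written files described in this post (spider.py inside spiders/, and run.py next to scrapy.cfg):

zongheng/
    scrapy.cfg
    run.py                  <- added by hand (see below)
    zongheng/
        __init__.py
        items.py
        middlewares.py
        pipelines.py
        settings.py
        spiders/
            __init__.py
            spider.py       <- added by hand (see below)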

Code Implementation

Create the project

scrapy startproject zongheng

spider.py

import copy

import scrapy

from zongheng.items import ZonghengItem


class ZonghengSpider(scrapy.Spider):
    name = "zongheng"
    start_urls = ["http://www.zongheng.com/"]

    def parse(self, response):
        # Links in the monthly-ticket ranking on the homepage
        books_info = response.xpath('//ul[@id="monthTicketRankList"]//a')
        for book in books_info:
            item = ZonghengItem()
            href = book.xpath('./@href').extract_first()   # book page URL
            name = book.xpath('./text()').extract_first()  # novel title
            # Turn http://book.zongheng.com/book/1013348.html into
            #      http://book.zongheng.com/showchapter/1013348.html
            # (the full chapter list); the slice skips the "book." in the domain.
            href = href[:25] + href[25:].replace("book", "showchapter")
            print(href, name)
            item['book'] = name
            yield scrapy.Request(url=href, callback=self.parse_dir,
                                 meta={'item': item})

    def parse_dir(self, response):
        """Parse the novel's chapter list."""
        item = response.meta['item']
        chapter_info = response.xpath('//li[@class=" col-4"]/a')
        author = response.xpath('//div[@class="book-meta"]//a/text()').extract_first()
        print('author:', author)
        item['author'] = author
        for chapter in chapter_info:
            href = chapter.xpath('./@href').extract_first()              # chapter URL
            item['chapter'] = chapter.xpath('./text()').extract_first()  # chapter title
            # Pass a deep copy of the item. Request() only stores a reference
            # in meta (a shallow copy), so without deepcopy every pending
            # request would share one item and see the last chapter written.
            yield scrapy.Request(url=href, callback=self.parse_content,
                                 meta={'item': copy.deepcopy(item)})

    def parse_content(self, response):
        """Parse the chapter text."""
        item = response.meta['item']
        # One string per <p> tag of the chapter body
        content = response.xpath('//div[@class="content"]/p/text()').extract()
        item['content'] = '\n'.join(content)
        yield item
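
The deep copy in parse_dir matters because Scrapy schedules the chapter requests asynchronously: meta only holds a reference, so every pending request would otherwise point at the same item, and the loop's last assignment to item['chapter'] would win for all of them. A standalone sketch of the pitfall:

import copy

item = {'chapter': 'Chapter 1'}
shared = item                    # what passing the item via meta amounts to
snapshot = copy.deepcopy(item)   # an independent per-request copy

item['chapter'] = 'Chapter 2'    # the loop moves on to the next chapter
print(shared['chapter'])         # Chapter 2 -- the shared reference was overwritten
print(snapshot['chapter'])       # Chapter 1 -- the deep copy kept its value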

items.py

import scrapy


class ZonghengItem(scrapy.Item):
    book = scrapy.Field()     # novel title
    author = scrapy.Field()   # author
    chapter = scrapy.Field()  # chapter title
    content = scrapy.Field()  # chapter text

pipelines.py

import pymysql


class ZonghengPipeline(object):

    def process_item(self, item, spider):
        # Insert (or find) the novel row, remembering its id for the chapters.
        with self.connection.cursor() as cursor:
            sql = "INSERT IGNORE INTO novel(novelName, author) VALUES (%s, %s)"
            try:
                result = cursor.execute(sql, (item['book'], item['author']))
                if result != 0:  # a new row was inserted
                    novel_id = self.connection.insert_id()
                else:
                    # The novel already exists (INSERT IGNORE hit the UNIQUE
                    # constraint), so look up its id instead.
                    cursor.execute("SELECT id FROM novel WHERE novelName = %s",
                                   (item['book'],))
                    novel_id = cursor.fetchone()['id']
                self.connection.commit()
            except Exception as e:
                print(e)
                return item
        # Insert the chapter row, linked to the novel via novelId.
        with self.connection.cursor() as cursor:
            sql = "INSERT INTO chapter(chapterName, content, novelId) VALUES (%s, %s, %s)"
            try:
                cursor.execute(sql, (item['chapter'], item['content'], novel_id))
                self.connection.commit()
            except Exception as e:
                print(e)
        return item

    def open_spider(self, spider):
        print("spider start")
        self.connection = pymysql.connect(host='localhost',
                                          user='root',
                                          password='123456',
                                          db='zongheng',
                                          charset='utf8mb4',
                                          cursorclass=pymysql.cursors.DictCursor)

    def close_spider(self, spider):
        print("spider end")
        self.connection.close()

run.py

from scrapy import cmdline

# Equivalent to running "scrapy crawl zongheng" in a shell;
# -s LOG_ENABLED=False disables Scrapy's log so only the print() output shows.
cmdline.execute("scrapy crawl zongheng -s LOG_ENABLED=False".split())
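
The same crawl can also be started programmatically with Scrapy's CrawlerProcess instead of cmdline. A sketch, assuming the spider class above lives in zongheng/spiders/spider.py:

from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

from zongheng.spiders.spider import ZonghengSpider  # module path is an assumption

process = CrawlerProcess(get_project_settings())  # loads settings.py
process.crawl(ZonghengSpider)
process.start()  # blocks until the crawl finishes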

settings.py

# Uncomment the pipeline setting and point it at this project's pipeline class
ITEM_PIPELINES = {
    'zongheng.pipelines.ZonghengPipeline': 300,
}
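
One more setting may need attention (a caveat not covered in the original post): a freshly generated project ships with ROBOTSTXT_OBEY = True, and Scrapy will then skip any URL the site's robots.txt disallows. If the spider finishes without yielding items, this is worth checking:

# settings.py -- only needed if robots.txt is blocking the crawl
ROBOTSTXT_OBEY = False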

Download

https://github.com/CarveStone/scrapy-crawl-zhongheng
