wzry.py
import scrapy

from LearnScrapy.items import HeroItem


class WzrySpider(scrapy.Spider):
    name = 'wzry'
    allowed_domains = ['pvp.qq.com']
    start_urls = ['https://pvp.qq.com/web201605/herolist.shtml']
    hero_detail_base_url = "https://pvp.qq.com/web201605/"

    def parse(self, response):
        # Collect the relative URL of every hero's detail page from the list page.
        hero_list = response.xpath(
            "//div[contains(@class, 'herolist-content')]"
            "/ul[contains(@class, 'herolist')]/li/a/@href").extract()
        # Three equivalent ways to follow each link, kept here for reference:
        # for hero_detail in hero_list:
        #     yield scrapy.Request(url=self.hero_detail_base_url + hero_detail,
        #                          callback=self.parse_hero_detail, meta={"msg": "ok"})
        #     yield scrapy.Request(url=response.urljoin(hero_detail),
        #                          callback=self.parse_hero_detail, meta={"msg": "ok"})
        #     yield response.follow(url=hero_detail,
        #                           callback=self.parse_hero_detail, meta={"msg": "ok"})
        requests = response.follow_all(urls=hero_list, callback=self.parse_hero_detail,
                                       meta={"msg": "ok"})
        for request in requests:
            yield request

    def parse_hero_detail(self, response):
        print(response.meta.get("msg"))  # value passed along from parse() via meta
        hero_name = response.xpath(
            "//div[contains(@class, 'cover')]"
            "/h2[contains(@class, 'cover-name')]/text()").extract_first()
        skills = response.xpath(
            "//div[contains(@class, 'skill-show')]/div[contains(@class, 'show-list')]")
        skill_list = []
        for skill in skills:
            skill_name = skill.xpath("./p[contains(@class, 'skill-name')]/b/text()").extract_first()
            skill_cd = skill.xpath("./p[contains(@class, 'skill-name')]/span[1]/text()").extract_first()
            skill_consume = skill.xpath("./p[contains(@class, 'skill-name')]/span[2]/text()").extract_first()
            skill_desc = skill.xpath("./p[contains(@class, 'skill-desc')]/text()").extract_first()
            # Skip empty placeholder slots: no name, no description, and the two
            # spans hold only the bare labels "冷却值:" (cooldown) and "消耗:" (cost).
            if not skill_name and skill_cd == "冷却值:" and skill_consume == "消耗:" and not skill_desc:
                continue
            skill_dict = {
                "skill_name": skill_name,
                "skill_cd": skill_cd,
                "skill_consume": skill_consume,
                "skill_desc": skill_desc
            }
            skill_list.append(skill_dict)
        hero_item = HeroItem()
        hero_item["hero_name"] = hero_name
        hero_item["hero_skills"] = skill_list
        yield hero_item
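The spider fills a HeroItem imported from LearnScrapy.items, a file not shown in this section. Based on the two fields assigned above, a minimal items.py would look like the following sketch (assuming the item needs no other fields):

items.py

import scrapy


class HeroItem(scrapy.Item):
    # Only the two fields the spider actually populates.
    hero_name = scrapy.Field()
    hero_skills = scrapy.Field()

With the project set up, the crawl is started with scrapy crawl wzry from the project root.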
pipelines.py

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

# useful for handling different item types with a single interface
# import pymysql
import MySQLdb
from itemadapter import ItemAdapter

from LearnScrapy.spiders.wzry import WzrySpider


class LearnscrapyPipeline:
    def process_item(self, item, spider):
        return item


class HeroPipeline:
    def process_item(self, item, spider):
        # Only handle items coming from WzrySpider: isinstance checks whether
        # the spider is an instance of that class.
        if isinstance(spider, WzrySpider):
            # Store the data in MySQL; open a connection first. (Opening a
            # connection per item is wasteful; moving this into open_spider()
            # and closing it in close_spider() would be the usual refinement.)
            conn = MySQLdb.connect(host="127.0.0.1", port=3306, user="rock",
                                   password="rock1204", database="WZRY", charset="utf8")
            cursor = conn.cursor()
            # Use parameterized queries so the driver escapes values itself;
            # building SQL with string interpolation breaks on quotes in the
            # scraped text and is open to SQL injection.
            cursor.execute("INSERT INTO hero(hero_name) VALUES (%s)",
                           (item.get("hero_name"),))
            conn.commit()
            # Look up the id of the row just inserted (cursor.lastrowid would
            # also work here).
            cursor.execute("SELECT id FROM hero WHERE hero_name=%s",
                           (item.get("hero_name"),))
            hero_id = cursor.fetchone()[0]
            for hero_skill in item.get("hero_skills"):
                cursor.execute(
                    "INSERT INTO skill(skill_name, skill_cd, skill_consume, skill_desc, hero_id) "
                    "VALUES (%s, %s, %s, %s, %s)",
                    (hero_skill.get("skill_name"), hero_skill.get("skill_cd"),
                     hero_skill.get("skill_consume"), hero_skill.get("skill_desc"),
                     hero_id))
            conn.commit()
            cursor.close()
            conn.close()
        return item
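As the template comment above says, a pipeline only runs once it is registered in the ITEM_PIPELINES setting. A minimal registration for this project would be the following (the priority 300 is an arbitrary value between 0 and 1000):

settings.py

ITEM_PIPELINES = {
    "LearnScrapy.pipelines.HeroPipeline": 300,
}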
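The INSERT statements assume a WZRY database in which the hero and skill tables already exist. The actual DDL is not part of this section; a schema consistent with the queries above might look like this (table and column names come from the code, column types and sizes are guesses):

CREATE TABLE hero (
    id        INT AUTO_INCREMENT PRIMARY KEY,
    hero_name VARCHAR(64)
);

CREATE TABLE skill (
    id            INT AUTO_INCREMENT PRIMARY KEY,
    skill_name    VARCHAR(64),
    skill_cd      VARCHAR(32),
    skill_consume VARCHAR(32),
    skill_desc    TEXT,
    hero_id       INT
);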