Scraping job listings from 51job
XPath selectors written against the rendered listing page matched nothing, which was the first hint that the page is loaded dynamically. Checking the raw page source confirmed it: the listing data is embedded as a JSON blob inside an inline script tag (assigned to a __SEARCH_RESULT__ variable) and rendered into the DOM by client-side JavaScript. So instead of parsing HTML, the spider extracts and parses that JSON.
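The extraction can be sanity-checked outside Scrapy first. A minimal sketch, assuming the inline script assigns the data as __SEARCH_RESULT__ = {...} directly followed by </script> (the same shape the spider below relies on); the User-Agent value is only an illustrative placeholder:

import json
import re

import requests

url = ('https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,1.html'
       '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99'
       '&companysize=99&ord_field=0&dibiaoid=0')
html = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}).text

# Cut the JSON object out of: ...__SEARCH_RESULT__ = {...}</script>
match = re.search(r'__SEARCH_RESULT__\s*=\s*(\{.*?\})</script>', html, re.S)
if match:
    data = json.loads(match.group(1))
    for row in data.get('engine_search_result', []):
        print(row.get('job_name'), row.get('company_name'), row.get('providesalary_text'))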
a51job.py
import json

import scrapy

from LearnScrapy1.items import CompanyItem, JobDescItem


class A51jobSpider(scrapy.Spider):
    name = '51job'
    allowed_domains = ['search.51job.com']
    start_urls = [
        'https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,1.html'
        '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99'
        '&companysize=99&ord_field=0&dibiaoid=0']
    base_urls = ('https://search.51job.com/list/010000,000000,0000,00,9,99,python,2,%d.html'
                 '?lang=c&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=99'
                 '&companysize=99&ord_field=0&dibiaoid=0')

    def parse(self, response):
        # The listing data never appears in the rendered HTML; it sits in the one
        # inline <script> block that has no src attribute.
        js = response.xpath('//script[contains(@type,"text/javascript") and not(@src)]').extract_first()
        # The script body ends: ...__SEARCH_RESULT__ = {...}</script>
        # [3:] drops ' = ', [:-9] drops '</script>', leaving bare JSON.
        r = js.split("__SEARCH_RESULT__")[1][3:-9]
        d = json.loads(r)
        companies = d.get("engine_search_result")
        for company in companies:
            company_item = CompanyItem()
            company_item["company_name"] = company.get("company_name")
            company_item["job_name"] = company.get("job_name")
            company_item["provide_salary_text"] = company.get("providesalary_text")
            company_item["job_href"] = company.get("job_href")
            company_item["work_area_text"] = company.get("workarea_text")
            yield company_item
            # Follow each posting for its full description. dont_filter=True also
            # lets the request bypass allowed_domains, since detail pages live on
            # a different subdomain.
            yield scrapy.Request(url=company.get("job_href"),
                                 callback=self.parse_job_detail,
                                 dont_filter=True)

        # Schedule the remaining listing pages once, from page 1 only.
        current_page = int(d.get('curr_page'))
        if current_page == 1:
            total_page = int(d.get('total_page'))
            for i in range(2, total_page + 1):  # pages 2..total_page inclusive
                yield scrapy.Request(url=self.base_urls % i,
                                     callback=self.parse,
                                     dont_filter=True)

    def parse_job_detail(self, response):
        # Detail pages come in two layouts; fall back to the second XPath when
        # the first matches nothing.
        result = response.xpath('//div[contains(@class,"bmsg job_msg inbox")]/p/text()').extract() or \
            response.xpath("//div[contains(@class, 'bmsg job_msg inbox')]/div/span/text()").extract()
        job = JobDescItem()
        job["content"] = ";".join(result)
        yield job
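The magic numbers in parse are easiest to see on a toy payload (the JSON body here is made up; only the surrounding markup matters):

sample = '<script type="text/javascript">window.__SEARCH_RESULT__ = {"total_page":"88"}</script>'
tail = sample.split("__SEARCH_RESULT__")[1]  # ' = {"total_page":"88"}</script>'
print(tail[3:-9])                            # {"total_page":"88"}

With that in place the spider runs as usual with scrapy crawl 51job.

pipelines.py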
# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://docs.scrapy.org/en/latest/topics/item-pipeline.html

import MySQLdb

from LearnScrapy1.items import CompanyItem, JobDescItem


class Learnscrapy1Pipeline:

    def open_spider(self, spider):
        # One connection for the whole crawl, opened when the spider starts.
        self.conn = MySQLdb.connect(host='localhost', user='root', password='6666',
                                    port=3306, database='job', charset='utf8')
        self.cursor = self.conn.cursor()

    def close_spider(self, spider):
        self.conn.close()

    def process_item(self, item, spider):
        # Parameterized queries let the driver escape quotes in the scraped text
        # (job descriptions routinely contain them) instead of breaking the SQL.
        if isinstance(item, CompanyItem):
            self.cursor.execute(
                "insert into company(company_name, job_href, company_addr, salary, job_name) "
                "values (%s, %s, %s, %s, %s);",
                (item.get("company_name"), item.get("job_href"), item.get("work_area_text"),
                 item.get("provide_salary_text"), item.get("job_name")))
            self.conn.commit()
        if isinstance(item, JobDescItem):
            self.cursor.execute("insert into job_desc(content) values (%s);",
                                (item.get("content"),))
            self.conn.commit()
        return item
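The pipeline assumes a MySQL database named job with two tables whose columns match the INSERT statements above. One possible one-off setup script; the column types and lengths are guesses, not taken from the original project:

import MySQLdb

conn = MySQLdb.connect(host='localhost', user='root', password='6666',
                       port=3306, database='job', charset='utf8')
cursor = conn.cursor()
# Sized generously for scraped strings; adjust as needed.
cursor.execute("""
    create table if not exists company (
        id int primary key auto_increment,
        company_name varchar(255),
        job_href varchar(512),
        company_addr varchar(255),
        salary varchar(64),
        job_name varchar(255)
    ) default charset=utf8;
""")
cursor.execute("""
    create table if not exists job_desc (
        id int primary key auto_increment,
        content text
    ) default charset=utf8;
""")
conn.commit()
conn.close()

items.py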
# Define here the models for your scraped items
#
# See documentation in:
# https://docs.scrapy.org/en/latest/topics/items.html

import scrapy


class Learnscrapy1Item(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass


class CompanyItem(scrapy.Item):
    company_name = scrapy.Field()
    provide_salary_text = scrapy.Field()
    job_href = scrapy.Field()
    work_area_text = scrapy.Field()
    job_name = scrapy.Field()


class JobDescItem(scrapy.Item):
    content = scrapy.Field()
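Finally, as the comment at the top of pipelines.py notes, nothing reaches the pipeline until it is registered in settings.py. A sketch of the relevant entries; the priority 300 is the conventional default, and the User-Agent and robots values are assumptions about what the site requires:

# settings.py (excerpt)
ITEM_PIPELINES = {
    'LearnScrapy1.pipelines.Learnscrapy1Pipeline': 300,
}

USER_AGENT = 'Mozilla/5.0'   # browser-like UA; the exact string is illustrative
ROBOTSTXT_OBEY = False       # likely needed for the listing URLs to be fetched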