python爬取招聘网站(实习网)并可视化展示

it2024-12-01  23

目标:

爬取实习网(招聘网站):https://www.shixi.com/search/index?key=%E5%A4%A7%E6%95%B0%E6%8D%AE 展示各区域招聘总量,并按降序排列在前端显示; 展示各地平均工资的差异,并在前端显示

1、编写爬虫文件testspider.py

获取到的字段有:name(职位名)、company(公司名)、grude(学历要求)、place(地点)、salary(工资)、time(发布时间)、people_num(招聘人数)、demand(职位要求)。

把爬取到的数据保存到work_data.csv文件里

import re
import os
import pymysql
from scrapy import Selector
import pandas as pd
import json
import csv
import requests

# Seconds to wait for the site before giving up on a single request, so one
# stalled connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 10


class testspider():
    """Crawl shixi.com search results and append them to work_data.csv.

    Each listing yields one record with the keys name, company, grude,
    place, salary, time, people_num and demand. The last two come from the
    posting's detail page, which is fetched separately for every listing.
    """

    def __init__(self):
        # Search results for the URL-encoded keyword "大数据".
        self.url = "https://www.shixi.com/search/index?key=%E5%A4%A7%E6%95%B0%E6%8D%AE"
        # Imitate a desktop browser. The value must be sent under the
        # "User-Agent" header name; any other key is ignored by the server
        # and requests falls back to its default python-requests agent.
        header = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36"
        self.headers = {
            "User-Agent": header
        }

    @staticmethod
    def _first(select, xpath, default=''):
        """Return the first node matched by *xpath*, or *default* if none."""
        matches = select.xpath(xpath).extract()
        return matches[0] if matches else default

    @staticmethod
    def _squash(text):
        """Remove every whitespace character from *text*."""
        return ''.join(text.split())

    def parse_detail(self, response):
        """Extract the head count and requirements from a detail page.

        Args:
            response: HTML text of the posting's detail page.

        Returns:
            A ``(people_num, demand)`` tuple of strings. ``demand`` is "无"
            when the requirements block is missing or empty.
        """
        select = Selector(text=response)
        people_num = ''.join(
            select.xpath('/html/body/div/div[3]/div[1]/div[1]/div[2]/span[@class="wantyou"]/text()').extract()
        )
        demand = self._squash(
            self._first(select, '/html/body/div/div[3]/div[1]/div[2]/div[@class="work_b"]/text()')
        )
        if not demand:
            return people_num, "无"
        return people_num, demand

    def parse_one_work(self, one_work):
        """Parse one listing card and enrich it from its detail page.

        Args:
            one_work: HTML fragment of a single ``job-pannel-list`` element.

        Returns:
            A dict with the eight record fields. Fields whose node is absent
            from the card are returned as empty strings instead of raising.
        """
        select = Selector(text=one_work)
        # Job title
        name = self._squash(self._first(select, './/div[1]/dl/dt/a/text()'))
        # Company name
        company = self._first(select, './/div[1]/dl/dd/div/a/text()')
        # Education requirement
        grude = self._first(select, './/div[1]/dl/dd/span/text()')
        # Location
        place = self._first(select, './/div[2]/div/span[1]/a/text()')
        # Salary
        salary = self._squash(self._first(select, './/div[2]/div/div/text()'))
        # Time the posting was published
        time = self._squash(self._first(select, './/div[2]/div/span[2]/text()'))
        # Relative link to the detail page
        href = ''.join(select.xpath('.//div[1]/dl/dt/a/@href').extract())
        if href:
            # Send the same browser headers as the list request.
            user_response = requests.get(
                "https://www.shixi.com" + href,
                headers=self.headers,
                timeout=REQUEST_TIMEOUT,
            ).text
            people_num, demand = self.parse_detail(user_response)
        else:
            # Without a link there is no detail page to fetch.
            people_num, demand = '', "无"
        return {
            "name": name,
            "company": company,
            "grude": grude,
            "place": place,
            "salary": salary,
            "time": time,
            "people_num": people_num,
            "demand": demand,
        }

    def get_url(self, page):
        """Fetch one page of search results and return its HTML text."""
        params = {
            "page": page
        }
        return requests.get(
            self.url,
            headers=self.headers,
            params=params,
            timeout=REQUEST_TIMEOUT,
        ).text

    def parse_html(self, response):
        """Parse every listing card found in a search-result page."""
        select = Selector(text=response)
        work_list = select.css("div[class=job-pannel-list]").extract()
        return [self.parse_one_work(one_work=one_work) for one_work in work_list]

    def dataToCsv(self, work_list):
        """Append the records in *work_list* to work_data.csv.

        The column header is written only when the file is new or empty, so
        repeated appends produce a single header row that lets the file be
        read back by column name.

        NOTE: the file is written to the current directory, while the
        processing script reads 'data/work_data.csv'; move it there before
        running that step.
        """
        if not work_list:
            # Nothing scraped for this page; avoid appending an empty frame.
            return
        path = 'work_data.csv'
        write_header = not os.path.exists(path) or os.path.getsize(path) == 0
        df = pd.DataFrame(work_list)
        df.to_csv(path, index=False, header=write_header, mode='a')

    def run(self, start_page=4, end_page=10):
        """Crawl pages ``start_page`` .. ``end_page - 1`` and save each one.

        The defaults keep the original page range (4 through 9).
        """
        for page in range(start_page, end_page):
            print("抓取到第{}页".format(page))
            response = self.get_url(page)  # request the page
            result = self.parse_html(response)  # parse the page
            print("抓取到{}条数据".format(len(result)))
            self.dataToCsv(result)


if __name__ == '__main__':
    spider = testspider()
    spider.run()

2、处理爬取到的数据

主要对salary和people_num列进行处理:这两列只保留其中的整数部分(例如“¥250/天”保留250)。然后按place列分组,对salary列求平均值,保存为salary_data.csv文件;再按place列分组,对people_num列求和,保存为people_data.csv文件。

import pandas as pd

# Column order of the records written by the spider; used when the CSV was
# saved without a header row.
COLUMNS = ['name', 'company', 'grude', 'place', 'salary', 'time',
           'people_num', 'demand']


def load_work_data(path):
    """Read the scraped CSV, tolerating a file saved without a header row."""
    data = pd.read_csv(path)
    if not {'place', 'salary', 'people_num'}.issubset(data.columns):
        # The first line was a data row, not a header: re-read with names.
        data = pd.read_csv(path, header=None, names=COLUMNS)
    return data


def clean_work_data(data):
    """Return a cleaned copy of the scraped postings.

    * ``place`` is cut to its first two characters.
    * Rows whose salary is exactly '¥2000/月' are removed.
    * ``salary`` and ``people_num`` are reduced to the first integer found
      in the text (e.g. '¥250/天' -> 250, '¥100-150/天' -> 100).
    * Rows with no integer in either column are dropped, so the integer
      conversion cannot fail on missing values.

    Args:
        data: DataFrame with at least place, salary and people_num columns.

    Returns:
        A new DataFrame; the input is not modified.
    """
    # Work on an explicit copy so the assignments below do not write into a
    # slice of the caller's frame.
    # NOTE(review): only this exact monthly value is filtered; other monthly
    # salaries, if present, would be averaged with daily ones - confirm.
    data = data[~data['salary'].isin(['¥2000/月'])].copy()
    data['place'] = data['place'].str[:2]
    # astype(str) keeps .str usable even if a column was parsed as numeric.
    data['people_num'] = data['people_num'].astype(str).str.extract(r'(\d+)', expand=False)
    data['salary'] = data['salary'].astype(str).str.extract(r'(\d+)', expand=False)
    data = data.dropna(subset=['salary', 'people_num'])
    data['salary'] = data['salary'].astype(int)
    data['people_num'] = data['people_num'].astype(int)
    return data


def average_salary_by_place(data):
    """Mean salary per place, highest first, rounded to one decimal."""
    salary_result = (
        data['salary'].groupby(data['place'])
        .mean()
        .sort_values(ascending=False)
        .round(1)
    )
    return pd.DataFrame({'place': salary_result.index,
                         'salary_avg': salary_result.values})


def total_people_by_place(data):
    """Total number of openings per place, highest first."""
    people_result = (
        data['people_num'].groupby(data['place'])
        .sum()
        .sort_values(ascending=False)
    )
    return pd.DataFrame({'place': people_result.index,
                         'num': people_result.values})


def main():
    """Clean data/work_data.csv and write the two summary CSV files."""
    data = clean_work_data(load_work_data('data/work_data.csv'))
    # Average salary per place -> salary_data.csv
    average_salary_by_place(data).to_csv('data/salary_data.csv')
    # Total openings per place -> people_data.csv
    total_people_by_place(data).to_csv('data/people_data.csv')


if __name__ == '__main__':
    main()

3、echarts.py

路由people获取people_data.csv文件的place(地点)列数据和num(招聘人数)列数据 路由salary获取salary_data.csv文件的place(地点)列数据和salary_avg(平均工资)列数据

from flask import Flask, redirect, url_for, request, render_template, flash
from sqlalchemy import create_engine
import pandas as pd
import csv

app = Flask(__name__)


def _load_columns(csv_path, *columns):
    """Read *csv_path* and return the requested columns as plain lists."""
    frame = pd.read_csv(csv_path, encoding="utf8")
    return [frame[column].tolist() for column in columns]


@app.route('/opinion')
def opinion():
    """Serve the page that hosts both charts."""
    return render_template('index.html')


@app.route('/people', methods=['GET', 'POST'])
def people():
    """Return the place and num columns of people_data.csv."""
    place, num = _load_columns("data/people_data.csv", 'place', 'num')
    print(place, num)
    return {'place': place, 'num': num}


@app.route('/salary', methods=['GET', 'POST'])
def salary():
    """Return the place and salary_avg columns of salary_data.csv."""
    place, salary_avg = _load_columns("data/salary_data.csv", 'place', 'salary_avg')
    print(place, salary_avg)
    return {'place': place, 'salary_avg': salary_avg}


if __name__ == '__main__':
    app.run(debug=True, port=5000, host='127.0.0.1')

4、可视化文件index.html

<!DOCTYPE html>
<html lang="zh-CN">
<head>
    <meta charset="UTF-8">
    <title>第三题可视化展示</title>
    <!-- Static assets are resolved by Flask so the URLs do not depend on
         the path of the route that renders this template. -->
    <script src="{{ url_for('static', filename='echarts.min.js') }}"></script>
    <script src="{{ url_for('static', filename='jquery.min.js') }}"></script>
    <style>
        #people,
        #salary {
            float: left;
        }
    </style>
</head>
<body>
<div id="people" style="width: 800px;height: 600px;"></div>
<div id="salary" style="width: 800px;height: 600px;"></div>
<script>
    // Create a chart in the given element, POST to `url`, and render the
    // option object that `buildOption` derives from the JSON response.
    function loadChart(elementId, url, buildOption) {
        var chart = echarts.init(document.getElementById(elementId));
        $.post(url, function (data) {
            chart.setOption(buildOption(data));
        }, 'json');
    }

    // Bar chart: number of openings per city.
    loadChart('people', '/people', function (data) {
        return {
            title: {
                text: '各城市招聘的数量',
                x: 'center'
            },
            tooltip: {
                trigger: 'item'
            },
            xAxis: {
                type: 'category',
                name: '城市',
                data: data.place
            },
            yAxis: {
                type: 'value',
                name: '数量'
            },
            series: [{
                data: data.num,
                type: 'bar'
            }]
        };
    });

    // Line chart: average salary per city.
    loadChart('salary', '/salary', function (data) {
        return {
            title: {
                text: '各城市的平均工资(单位 元/天)',
                x: 'center'
            },
            tooltip: {
                trigger: 'item'
            },
            xAxis: {
                type: 'category',
                name: '城市',
                data: data.place
            },
            yAxis: {
                type: 'value',
                name: '元/天'
            },
            series: [{
                data: data.salary_avg,
                type: 'line'
            }]
        };
    });
</script>
</body>
</html>

运行echarts.py文件后,在浏览器输入http://127.0.0.1:5000/opinion,可以看到各城市招聘数量的柱状图和各城市平均工资的折线图。

最新回复(0)