首先我们要安装好 selenium，其中最重要的是下载与本机谷歌浏览器版本对应的驱动文件（chromedriver），下载链接：http://chromedriver.storage.googleapis.com/index.html 。在使用 webdriver.Chrome() 方法的时候可能会报找不到驱动的错误，这时可以直接把驱动文件放在项目目录下面，或者把驱动路径作为参数写在 () 里。
def __init__(self):
    """Launch a Chrome browser, maximize its window and store the start URL."""
    # Start URL
    self.url = "https://www.lagou.com/"
    browser = webdriver.Chrome()
    browser.maximize_window()
    self.driver = browser


# Next come opening the page and simulating the clicks. Elements are located
# by XPath here: in Chrome, right-click the element and copy its XPath.
def search(self, keywords):
    """Search the site for *keywords* and return the resulting page's HTML.

    Opens the start URL, closes the popup if one is present, types the
    keywords, clicks the search button, waits two seconds and reads the
    page source.

    The browser is always closed before this method returns or raises, so
    an instance can run only one search.

    Args:
        keywords: Text to type into the search field.

    Returns:
        The HTML source of the page shown after the search.
    """
    # Imported locally so this method stands on its own. The old
    # find_element_by_xpath helpers were removed in Selenium 4.3.
    from selenium.webdriver.common.by import By

    try:
        # Open the page
        self.driver.get(self.url)
        # Close the popup. find_elements returns an empty list instead of
        # raising, so a missing popup no longer aborts the search.
        for close_button in self.driver.find_elements(By.XPATH, "//*[@id='cboxClose']"):
            close_button.click()
        # NOTE(review): this targets an <a> tab rather than an input element;
        # confirm it is the intended element for typing the keywords.
        self.driver.find_element(By.XPATH, "//a[@class='tab focus']").send_keys(keywords)
        # Click the search button
        self.driver.find_element(By.XPATH, "//*[@id='search_button']").click()
        # Wait two seconds before reading the page
        time.sleep(2)
        return self.driver.page_source
    finally:
        # Always release the browser, even when a step above fails.
        self.driver.quit()


# Finally, the data is extracted from the page; BeautifulSoup is used for this.
def get_jobs(self, page_source):
    """Parse job entries out of a result page, print them and return them.

    Args:
        page_source: HTML text of the search result page.

    Returns:
        A list with one dict per ``.con_list_item`` element, holding the
        keys ``job``, ``company`` and ``salary``. A field whose element is
        not found in the item is stored as ``None`` instead of raising.
    """
    # Dict key -> CSS selector, relative to one list item.
    field_selectors = {
        'job': ".position_link > h3",
        'company': ".company_name > a",
        'salary': ".money",
    }
    soup = BeautifulSoup(page_source, 'lxml')
    jobs = []
    for item in soup.select('.con_list_item'):
        d = {}
        for key, css in field_selectors.items():
            node = item.select_one(css)
            # select_one returns None when nothing matches.
            d[key] = node.get_text(strip=True) if node is not None else None
        print(d)
        jobs.append(d)
    return jobs


# The complete code follows:
import time

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By


class lagou:
    """Drive Chrome to search lagou.com and extract job listings from the result page."""

    def __init__(self):
        """Launch a Chrome browser, maximize its window and store the start URL."""
        self.driver = webdriver.Chrome()
        self.driver.maximize_window()
        # Start URL
        self.url = "https://www.lagou.com/"

    def search(self, keywords):
        """Search the site for *keywords* and return the resulting page's HTML.

        Opens the start URL, closes the popup if one is present, types the
        keywords, clicks the search button, waits two seconds and reads the
        page source.

        The browser is always closed before this method returns or raises,
        so an instance can run only one search.

        Args:
            keywords: Text to type into the search field.

        Returns:
            The HTML source of the page shown after the search.
        """
        try:
            # Open the page
            self.driver.get(self.url)
            # Close the popup. find_elements returns an empty list instead
            # of raising, so a missing popup no longer aborts the search.
            for close_button in self.driver.find_elements(By.XPATH, "//*[@id='cboxClose']"):
                close_button.click()
            # NOTE(review): this targets an <a> tab rather than an input
            # element; confirm it is the intended element for the keywords.
            self.driver.find_element(By.XPATH, "//a[@class='tab focus']").send_keys(keywords)
            # Click the search button
            self.driver.find_element(By.XPATH, "//*[@id='search_button']").click()
            # Wait two seconds before reading the page
            time.sleep(2)
            return self.driver.page_source
        finally:
            # Always release the browser, even when a step above fails.
            self.driver.quit()

    def get_jobs(self, page_source):
        """Parse job entries out of a result page, print them and return them.

        Args:
            page_source: HTML text of the search result page.

        Returns:
            A list with one dict per ``.con_list_item`` element, holding
            the keys ``job``, ``company`` and ``salary``. A field whose
            element is not found in the item is stored as ``None`` instead
            of raising.
        """
        # Dict key -> CSS selector, relative to one list item.
        field_selectors = {
            'job': ".position_link > h3",
            'company': ".company_name > a",
            'salary': ".money",
        }
        soup = BeautifulSoup(page_source, 'lxml')
        jobs = []
        for item in soup.select('.con_list_item'):
            d = {}
            for key, css in field_selectors.items():
                node = item.select_one(css)
                # select_one returns None when nothing matches.
                d[key] = node.get_text(strip=True) if node is not None else None
            print(d)
            jobs.append(d)
        return jobs


if __name__ == '__main__':
    hot = lagou()
    page_source = hot.search('python')
    hot.get_jobs(page_source)