process_request() 拦截请求
使用UA池(不推荐)
def process_request(self, request, spider):
    """Intercept an outgoing request and disguise its User-Agent.

    One entry is picked at random from ``self.USER_AGENT_LIST`` and written
    into the ``User-Agent`` header of the request.

    Called for each request that goes through the downloader middleware.
    A ``process_request`` implementation must either:

    - return None: continue processing this request
    - return a Response object
    - return a Request object
    - raise IgnoreRequest: process_exception() methods of installed
      downloader middleware will be called

    :param request: the intercepted request; its headers are modified in place.
    :param spider: not used by this method.
    :return: always None, so processing of the request continues.
    """
    # Function-scope import so the snippet works even when pasted on its own.
    import random

    # UA spoofing. The original spelled this `rando.chiose`, which is not a
    # defined name and would fail on the first request.
    request.headers['User-Agent'] = random.choice(self.USER_AGENT_LIST)
    return None


# Next approach: use the fake-useragent module (recommended).
安装模块:pip install fake-useragent
def process_request(self, request, spider):
    """Intercept an outgoing request and set its User-Agent via fake-useragent.

    The ``User-Agent`` header of the request is overwritten with the value of
    ``UserAgent().random``.

    Called for each request that goes through the downloader middleware.
    A ``process_request`` implementation must either:

    - return None: continue processing this request
    - return a Response object
    - return a Request object
    - raise IgnoreRequest: process_exception() methods of installed
      downloader middleware will be called

    :param request: the intercepted request; its headers are modified in place.
    :param spider: not used by this method.
    :return: always None, so processing of the request continues.
    """
    # UA spoofing. `UserAgent` is expected to come from the fake-useragent
    # package (pip install fake-useragent).
    # NOTE(review): a new UserAgent() is built for every request; consider
    # reusing one instance if construction proves costly - confirm.
    request.headers['User-Agent'] = UserAgent().random
    return None


# process_response(): intercepts every response.
这里以爬取网易新闻为例
process_exception() 拦截异常的请求
代理IP
# Proxy pools, one per URL scheme. process_exception() reads them through
# `self`, so they are presumably meant to live in the middleware class body
# next to it (the class header is not visible in this snippet).
PROXY_http = [
    '153.180.102.104:80',
    '195.208.131.189:56055',
]
PROXY_https = [
    '120.83.49.90:9000',
    '95.189.112.214:35508',
]


def process_exception(self, request, exception, spider):
    """Intercept a failed request, attach a random proxy, and resend it.

    Called when a download handler or a process_request() (from other
    downloader middleware) raises an exception. A ``process_exception``
    implementation must either:

    - return None: continue processing this exception
    - return a Response object: stops process_exception() chain
    - return a Request object: stops process_exception() chain

    :param request: the request that failed; ``request.meta['proxy']`` is
        set in place.
    :param exception: the raised exception (not inspected here).
    :param spider: not used by this method.
    :return: the same request object, now carrying a proxy.
    """
    # Proxy IP: choose the pool by the text before the first ':' in the URL.
    scheme = request.url.split(':')[0]
    if scheme == 'http':
        request.meta['proxy'] = 'http://' + random.choice(self.PROXY_http)
    else:
        request.meta['proxy'] = 'https://' + random.choice(self.PROXY_https)

    # Hand the corrected request object back so that it is sent again.
    # NOTE(review): a re-submitted request may be dropped by the scheduler's
    # duplicate filter unless dont_filter is set - confirm against Scrapy.
    return request