Python Web Scraping (Getting Started)



Table of Contents

Part 1: Scraping a web page's source
Part 2: Downloading an image
Part 3: Scraping Youdao Translate
Scraping Youdao Translate (improved)
Part 4: Scraping IP addresses

Part 1: Scraping a web page's source

# Scrape a web page's source code
import urllib.request

'''
# Build a Request object for the url
req = urllib.request.Request(url)
# Open the url
res = urllib.request.urlopen(req)
'''

# Request and open the url directly
res = urllib.request.urlopen(r'http://taobao.com')
# Read the scraped source code
html = res.read().decode('utf-8')
print(html)
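The commented-out lines above hint at the Request-based form. As a hedged sketch (not in the original post), the same fetch can go through a Request object with a custom User-Agent header plus basic error handling; the timeout and the 200-character preview are arbitrary choices:

import urllib.request
import urllib.error

url = 'http://taobao.com'
# Build a Request object so extra headers can be attached
req = urllib.request.Request(url)
req.add_header('User-Agent', 'Mozilla/5.0')

try:
    res = urllib.request.urlopen(req, timeout=10)  # timeout value is an arbitrary example
    html = res.read().decode('utf-8')
    print(html[:200])  # preview only the first 200 characters
except urllib.error.URLError as e:
    # URLError (and its subclass HTTPError) carries the failure reason
    print('request failed:', e.reason)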

Part 2: Downloading an image

# Download an image
import urllib.request

res = urllib.request.urlopen('http://placekitten.com/200/300')
cat_img = res.read()

# `with` gives the try-finally behaviour that makes sure the file is closed properly
with open('cat_200_300_img.jpg', 'wb') as f:
    # Write the downloaded bytes to the file
    f.write(cat_img)
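Not part of the original post, but worth noting as an alternative: urllib.request.urlretrieve saves a URL straight to a local file, so the explicit read/write pair above can be collapsed into one call.

import urllib.request

# Download the same image directly to disk in a single call
urllib.request.urlretrieve('http://placekitten.com/200/300', 'cat_200_300_img.jpg')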

Part 3: Scraping Youdao Translate

# Scrape the Youdao Translate service
# http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule
import urllib.request
import urllib.parse
import json

content = input('Enter the text to translate: ')

url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'

data = {}
data['i'] = content
data['from'] = 'AUTO'
data['to'] = 'AUTO'
data['smartresult'] = 'dict'
data['client'] = 'fanyideskweb'
data['salt'] = '16032792430152'
data['sign'] = '2104aa5f2617308d1e4943d792c3cc16'
data['lts'] = '1603279243015'
data['bv'] = '328517d280da8271413e56aa2fb123bf'
data['doctype'] = 'json'
data['version'] = '2.1'
data['keyfrom'] = 'fanyi.web'
data['action'] = 'FY_BY_CLICKBUTTION'
data = urllib.parse.urlencode(data).encode('utf-8')

res = urllib.request.urlopen(url, data)
# Read the scraped content as a string
html = res.read().decode('utf-8')
target = json.loads(html)
print('Translation: %s' % target['translateResult'][0][0]['tgt'])
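If Youdao rejects the request, the returned JSON may not contain a translateResult field, and the indexing above will raise an exception. A minimal, hedged guard (not in the original code, and assuming target is the dict produced by json.loads) could look like this:

result = target.get('translateResult')
if result:
    print('Translation: %s' % result[0][0]['tgt'])
else:
    # The exact error format of the service may vary; just show what came back
    print('Unexpected response:', target)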
Scraping Youdao Translate (improved)
# http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule
import urllib.request
import urllib.parse
import json
import time

# Loop so translations can be run repeatedly
while True:
    content = input('Enter the text to translate (type "r!" to quit): ')
    if content == 'r!':
        break

    url = 'http://fanyi.youdao.com/translate?smartresult=dict&smartresult=rule'

    # Add a header so the request looks more like a human visitor
    head = {}
    head['User-Agent'] = 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36'

    data = {}
    data['i'] = content
    data['from'] = 'AUTO'
    data['to'] = 'AUTO'
    data['smartresult'] = 'dict'
    data['client'] = 'fanyideskweb'
    data['salt'] = '16032792430152'
    data['sign'] = '2104aa5f2617308d1e4943d792c3cc16'
    data['lts'] = '1603279243015'
    data['bv'] = '328517d280da8271413e56aa2fb123bf'
    data['doctype'] = 'json'
    data['version'] = '2.1'
    data['keyfrom'] = 'fanyi.web'
    data['action'] = 'FY_BY_CLICKBUTTION'
    data = urllib.parse.urlencode(data).encode('utf-8')

    # Pass the headers through the Request object
    req = urllib.request.Request(url, data, head)
    # Alternatively: req.add_header('User-Agent', 'Mozilla/5.0 ...')
    res = urllib.request.urlopen(req)
    html = res.read().decode('utf-8')
    target = json.loads(html)
    print('Translation: %s' % target['translateResult'][0][0]['tgt'])

    # Pause between requests so the traffic looks more like a human visitor:
    # allow one translation every 5 seconds
    time.sleep(5)
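One possible refinement that is not in the original code: replacing the fixed 5-second pause with a randomized interval makes the request timing less regular. The 3-8 second range below is an arbitrary example.

import random
import time

# Sleep for a random interval between 3 and 8 seconds instead of a fixed 5
time.sleep(random.uniform(3, 8))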

Part 4: Scraping IP addresses

import urllib.request
import re

# Open the url and return the page source
def open_url(url):
    req = urllib.request.Request(url)
    req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/85.0.4183.121 Safari/537.36')
    page = urllib.request.urlopen(req)
    html = page.read().decode('utf-8')
    return html

# Extract the IP addresses
def get_ip(html):
    # Regular expression that matches an IPv4 address
    p = r'(?:(?:[01]?\d?\d|2[0-4]\d|25[0-5])\.){3}(?:[01]?\d?\d|2[0-4]\d|25[0-5])'
    # Find every address that matches the pattern
    ip_list = re.findall(p, html)
    # Print each matched IP address
    for each in ip_list:
        print(each)

# Only run when executed as the main program
if __name__ == '__main__':
    url = 'http://taobao.com'
    get_ip(open_url(url))
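To see what the IPv4 pattern captures without hitting a live site, here is a small self-contained check against a made-up sample string (the addresses are purely illustrative):

import re

p = r'(?:(?:[01]?\d?\d|2[0-4]\d|25[0-5])\.){3}(?:[01]?\d?\d|2[0-4]\d|25[0-5])'
# Hypothetical sample text, just to demonstrate the pattern
sample = 'gateway 192.168.1.10, dns 8.8.8.8, api host 10.0.0.1'
print(re.findall(p, sample))  # ['192.168.1.10', '8.8.8.8', '10.0.0.1']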