Scraping high school vocabulary
import re
import codecs
from urllib import request, error

from bs4 import BeautifulSoup


def askurl(url):
    """Fetch a page with a browser User-Agent; return its HTML, or None on failure."""
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
        req = request.Request(url=url, headers=headers)
        respond = request.urlopen(req)
        return respond.read().decode('utf-8')
    except error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)


# Capture the text between <p> and </p>; re.S lets '.' match across newlines.
FindContent = re.compile(r'<p>(.*?)</p>', re.S)


def getdata():
    datalist = []
    baseurl = 'http://www.1mpi.com/doc/eea782580808987333652d93/'
    for i in range(21):  # the word list spans pages 1-21
        url = baseurl + str(i + 1)
        html = askurl(url)
        if html is None:  # askurl already printed the error; skip this page
            continue
        soup = BeautifulSoup(html, 'html.parser')
        for item in soup.find_all('div', {'class': 'contents', 'id': 'contents'}):
            content = FindContent.findall(str(item))
            content = [line.replace('△', '') for line in content]
            try:
                # Each del re-indexes the list, so together these four deletions
                # drop what were originally entries 0, 1, 2 and 4. Page 19
                # (i == 18) has fewer leading entries, so only one is dropped.
                if i != 18:
                    del content[0], content[1], content[2]
                del content[0]
            except Exception as reason:
                print(i, reason, content)
            # Strip the unit heading (e.g. '必修一 UNIT 1') from the first entry.
            content[0] = re.sub('必修一 UNIT.{2}', '', content[0])
            datalist.extend(content)
    return datalist


def savedata(savepath):
    datalist = getdata()
    with codecs.open(savepath, 'w', 'utf-8') as file:
        for entry in datalist:
            file.write(entry)


def main():
    savedata('d:\\high school word.txt')


if __name__ == '__main__':
    main()
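The regex pulls paragraph text out of the stringified div, which is fine as long as the paragraphs contain no nested markup. BeautifulSoup can also extract the same paragraphs directly, and get_text() strips any nested tags that the regex would keep. A minimal sketch of that alternative, using a made-up HTML snippet in place of a real page:

from bs4 import BeautifulSoup

# Hypothetical snippet standing in for one page's contents div.
sample = '<div class="contents" id="contents"><p>△abandon v. 放弃</p><p>ability n. 能力</p></div>'
soup = BeautifulSoup(sample, 'html.parser')
div = soup.find('div', id='contents')
words = [p.get_text().replace('△', '') for p in div.find_all('p')]
print(words)  # ['abandon v. 放弃', 'ability n. 能力']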
Scraping core computer science vocabulary (the data is not cleaned here; the cleaning step comes in the next post, on plotting)

import re
import codecs
from urllib import request, error

from bs4 import BeautifulSoup


def askurl(url):
    """Fetch a page with a browser User-Agent; return its HTML, or None on failure."""
    try:
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.108 Safari/537.36'}
        req = request.Request(url=url, headers=headers)
        respond = request.urlopen(req)
        return respond.read().decode('utf-8')
    except error.URLError as e:
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)


# Capture the text between <p> and </p>; re.S lets '.' match across newlines.
FindContent = re.compile(r'<p>(.*?)</p>', re.S)


def getdata():
    url = 'https://www.hujiang.com/c/kyyych/p1273859/'
    html = askurl(url)
    if html is None:
        return []
    soup = BeautifulSoup(html, 'html.parser')
    # The word list sits in a single article div; return after the first match.
    for item in soup.find_all('div', {'class': 'article-content', 'id': 'J-article-content'}):
        return FindContent.findall(str(item))
    return []  # no matching div found


def savedata(savepath):
    datalist = getdata()
    print(datalist)  # raw, uncleaned paragraphs
    with codecs.open(savepath, 'w', 'utf-8') as file:
        for entry in datalist:
            file.write(entry)


def main():
    savedata('d:\\computer major words.txt')


if __name__ == '__main__':
    main()
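Both scripts write the extracted entries back to back with no separator, so the output file is one unbroken run of text. If one-entry-per-line output would make the later cleaning pass easier, a small variant of savedata could join the list with newlines instead (savedata_lines is a hypothetical name, not part of the original scripts):

import codecs

def savedata_lines(datalist, savepath):
    # One extracted paragraph per line, so cleaning can work line by line.
    with codecs.open(savepath, 'w', 'utf-8') as file:
        file.write('\n'.join(datalist))

# e.g. savedata_lines(getdata(), 'd:\\computer major words.txt')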