For work I needed to download a large amount of statistical data from CNKI, so I learned to write a crawler in Python. It sped things up many times over. The code is as follows:
# -*- coding: utf-8 -*-
import urllib.parse
import urllib.request
import random
import re
import time

import requests
from bs4 import BeautifulSoup


def get_result(ybcode, page=1):
    # POST the catalogue query and return the HTML fragment for one page
    data = {'ybcode': ybcode, 'entrycode': '', 'page': page, 'pagerow': '20'}
    headers = {'Content-Type': 'application/x-www-form-urlencoded'}
    url = 'http://data.cnki.net/Yearbook/PartialGetCatalogResult'
    params = urllib.parse.urlencode(data).encode('utf-8')
    req = urllib.request.Request(url, params, headers)
    r = urllib.request.urlopen(req)
    return str(r.read(), 'utf-8')


def get_pageno(ybcode):
    # Read the total page count out of the pager text; the site's pager
    # is in Chinese ("共...页"), so we split on those characters
    soup = BeautifulSoup(get_result(ybcode), 'lxml')
    pageno = int(soup.select('.s_p_listl')[0].get_text().split('共')[2].split('页')[0])
    print('Total pages: ' + str(pageno))
    return pageno


def dataclear(data):
    # Clean the table title: strip all \n and \r and collapse runs of spaces
    data = re.sub('\n+', ' ', data)
    data = re.sub('\r+', ' ', data)
    data = re.sub(' +', ' ', data)
    return data


def filedata(ybcode):
    # Download every Excel table of a CNKI statistical yearbook
    pageno = get_pageno(ybcode)
    for i in range(1, pageno + 1):
        print('######################################## page ' + str(i) + ' ###################################')
        soup = BeautifulSoup(get_result(ybcode, i), 'lxml')
        for j in soup.select('tr'):
            s = BeautifulSoup(str(j), 'lxml')
            # Rows without the download icon have no Excel file attached
            if len(s.select('img[src="/resources/design/images/nS_down2.png"]')) == 0:
                continue
            try:
                if len(s.select('td:nth-of-type(3) > a')) >= 2:
                    title = str(s.select('td:nth-of-type(1) > a')[0].get_text())
                    url = 'http://data.cnki.net' + s.select('td:nth-of-type(3) > a')[1].get('href')
                    # Without cleaning, special characters such as \n end up
                    # in the file name and the download fails
                    title = dataclear(title)
                    filedown(title, url)
                    print(title)
            except Exception as e:
                print('error:-------------------' + str(e))


def filedown(title, url):
    # Download a single file, then sleep 1-2 seconds to go easy on the server
    try:
        r = requests.get(url)
        with open(title + '.xls', 'wb') as code:
            code.write(r.content)
    except Exception:
        pass
    time.sleep(random.randint(1, 2))


if __name__ == '__main__':
    ybcode = 'N2013060059'  # change this code to download a different yearbook
    filedata(ybcode)
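One caveat worth noting: dataclear only removes whitespace, so a table title that happens to contain a character that is illegal in file names (for example / or :) would still make the open call fail. A minimal sketch of a stricter sanitizer that could be applied to the title before filedown; the exact character set replaced here is my own assumption, not part of the original script:

import re

def safe_filename(title):
    # Replace characters that are invalid in Windows (and awkward on other
    # filesystems) with an underscore; this character class is an assumption
    return re.sub(r'[\\/:*?"<>|]', '_', title)

# e.g. safe_filename('3-1 人口/户数') -> '3-1 人口_户数'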