Notes on My First CNKI Scraper

Date: 2022-10-26 16:41:38

For work I needed to download a large amount of statistical data from CNKI, so I learned to write a scraper in Python. It has since made the job many times faster. The code is as follows:

# -*- coding: utf-8 -*-
import urllib.request
import urllib.parse
from bs4 import BeautifulSoup
import requests
import time
import random
import re


def get_result(ybcode, page=1):  # request one page of the yearbook catalog
    data = {'ybcode': ybcode, 'entrycode': '', 'page': page, 'pagerow': '20'}
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded',
    }
    url = "http://data.cnki.net/Yearbook/PartialGetCatalogResult"
    params = urllib.parse.urlencode(data).encode(encoding='utf-8')
    req = urllib.request.Request(url, params, headers)
    r = urllib.request.urlopen(req)
    res = str(r.read(), 'utf-8')
    return res
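
# Note: this "Partial" endpoint appears to return an HTML fragment containing
# the catalog rows for one page rather than a full document; get_pageno() and
# filedata() below parse that fragment with BeautifulSoup.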

def get_pageno(ybcode):  # get the total number of catalog pages
    soup = BeautifulSoup(get_result(ybcode), 'lxml')
    # the pager text contains '共' twice; the page count sits between the
    # second '共' and the following '页'
    pages = int(soup.select('.s_p_listl')[0].get_text().split('共')[2].split('页')[0])
    print('Total pages: ' + str(pages))
    return pages


def dataclear(data):  # clean the text: collapse all \n, \r and repeated spaces
    # e.g. dataclear('Table\n 1-1\r  GDP') -> 'Table 1-1 GDP'
    data = re.sub('\n+', ' ', data)
    data = re.sub('\r+', ' ', data)
    data = re.sub(' +', ' ', data)
    return data


def filedata(ybcode):  # download all Excel tables of a CNKI statistical yearbook
    pageno = get_pageno(ybcode)
    for i in range(1, pageno + 1):
        print('######################################## page ' + str(i) + ' ###################################')
        soup = BeautifulSoup(get_result(ybcode, i), 'lxml')
        for j in soup.select('tr'):
            s = BeautifulSoup(str(j), 'lxml')
            # only rows carrying the download icon have an Excel file attached
            if len(s.select('img[src="/resources/design/images/nS_down2.png"]')) == 0:
                continue
            try:
                if len(s.select('td:nth-of-type(3) > a')) >= 2:
                    title = str(s.select('td:nth-of-type(1) > a')[0].get_text())
                    url = 'http://data.cnki.net' + s.select('td:nth-of-type(3) > a')[1].get('href')
                    title = dataclear(title)  # uncleaned titles contain \n etc., which breaks the filename
                    filedown(title, url)
                    print(title)
            except Exception as e:
                print('error:-------------------' + str(e))

def filedown(title, url):  # download one file and save it as <title>.xls
    try:
        r = requests.get(url)
        with open(title + ".xls", "wb") as f:
            f.write(r.content)
    except Exception as e:
        print('download failed: ' + str(e))
    # sleep 1-2 seconds between downloads to go easy on the server
    time.sleep(random.randint(1, 2))

if __name__ == '__main__':
    ybcode = 'N2013060059'  # change this code to download a different yearbook
    filedata(ybcode)
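
A possible improvement for anyone reusing this: the script writes every .xls into the current working directory, and two tables whose cleaned titles collide will silently overwrite each other. Below is a minimal sketch of a safer filedown; the outdir parameter, the skip-if-exists check, and the timeout are my additions (they assume an extra import os at the top), not part of the original run:

import os

def filedown(title, url, outdir='yearbook_xls'):  # variant that writes into its own folder
    os.makedirs(outdir, exist_ok=True)            # create the output folder on first use
    path = os.path.join(outdir, title + '.xls')
    if os.path.exists(path):                      # skip files already on disk,
        return                                    # so an interrupted run can resume
    try:
        r = requests.get(url, timeout=30)         # don't hang forever on one file
        with open(path, 'wb') as f:
            f.write(r.content)
    except Exception as e:
        print('download failed: ' + str(e))
    time.sleep(random.randint(1, 2))              # same polite pause as above

With this variant, rerunning the script after a network failure picks up where it left off instead of re-downloading everything.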