单网页版(建议)
只爬取一个网页,通过手动更改url的数字来实现多个页面的爬取
# encoding = utf8
# write by xdd1997  xdd2026@qq.com
# 2020-08-21
"""Single-page scraper: fetch ONE results page of a Google Scholar mirror
and print the paper entries found on it.

Fetching many pages in a single run is still not recommended: the mirror
may ban the client IP, and the ban duration is unknown.  To walk pages,
change the `ii` offset manually (0, 10, 20, ...) and rerun.
"""
import sys

import requests
from bs4 import BeautifulSoup

ii = 90  # result offset of the page to fetch; change manually to walk pages
url = ("https://scholar.paodekuaiweixinqun.com/scholar"
       "?start={}&q=Cylindrical+Shells&hl=zh-CN&as_sdt=0,5&as_ylo=2016").format(ii)
# e.g. https://scholar.paodekuaiweixinqun.com/scholar?start=140&q=Cylindrical+Shells&hl=zh-CN&as_sdt=0,5&as_ylo=2016
print(url)

try:
    kv = {'user-agent': 'Mozilla/5.0'}  # minimal UA header to pass the bot check
    r = requests.get(url, headers=kv)
    r.raise_for_status()  # raise if the HTTP status is not a 2xx success
    r.encoding = r.apparent_encoding
except requests.RequestException:  # was a bare `except:` that swallowed everything
    print("进入网站失败")
    sys.exit(1)  # original fell through and crashed on the undefined `r`

demo = r.text
soup = BeautifulSoup(demo, "html.parser")
print('----------------------------------------------------------------------------------------------')

paperlist = []
# Result links open in a new tab, hence the target="_blank" filter.
for ss in soup.find_all('a', {"target": "_blank"}):
    tex = ss.get_text().replace(' ', '').split('\n')
    # Entries with >= 6 line fragments are real result entries; join them.
    # (The original tested `t != None`, which is always true for split()
    # output; joining only non-empty fragments yields the same string.)
    if len(tex) >= 6:
        paperlist.append(''.join(t for t in tex if t))

for paper in paperlist:
    if len(paper) > 30:  # skip short junk such as "[PDF] researchgate.net"
        print(paper)
多网页版(注意,注意,注意)
注意:很有可能会被封,具体多长时间不清楚
关于被封:比如程序爬爬爬,爬到第9页的时候谷歌发现了,把你封了,那这一页你就打不开了,手动也打不开,其他页面也间隔存在打不开的情况
# encoding = utf8
# write by xdd1997  xdd2026@qq.com
# 2020-08-21
"""Multi-page scraper: walk several result pages of a Google Scholar
mirror in one run.

WARNING: very likely to get the client IP banned (when this was written,
fetching up to start=90 triggered a ban; some pages then stop opening
even manually).  The ban duration is unknown.
"""
import random
import time

import requests
from bs4 import BeautifulSoup

for ii in range(0, 80, 10):  # offsets 0..70; start=90 previously triggered a ban
    url = ("https://scholar.paodekuaiweixinqun.com/scholar"
           "?start={}&q=Cylindrical+Shells&hl=zh-CN&as_sdt=0,5&as_ylo=2016").format(ii)

    try:
        kv = {'user-agent': 'Mozilla/5.0'}  # minimal UA header to pass the bot check
        r = requests.get(url, headers=kv)
        r.raise_for_status()  # raise if the HTTP status is not a 2xx success
        r.encoding = r.apparent_encoding
    except requests.RequestException:  # was a bare `except:` that swallowed everything
        print("进入网站失败")
        continue  # original fell through and crashed on the undefined `r`

    soup = BeautifulSoup(r.text, "html.parser")
    print('----------------------------------------------------------------------------------------------')

    # Result links open in a new tab, hence the target="_blank" filter.
    for ss in soup.find_all('a', {"target": "_blank"}):
        tex = ss.get_text().replace(' ', '').split('\n')
        # Exactly 7 fragments marks a full result entry; fields 1/3/6 are
        # presumably title / authors / source — TODO confirm against the page.
        if len(tex) == 7:
            print(tex[1] + ' ' + tex[3] + ' ' + tex[6])

    time.sleep(random.random() * 10 + 5)  # 5-15 s random delay between pages