Scraping article titles for a Google Scholar keyword with Python

Date: 2024-04-15 16:41:29

Single-page version (recommended)

Crawls only one page; to cover multiple pages, change the start number in the URL by hand between runs.

#encoding = utf8
# write by xdd1997  xdd2026@qq.com
# 2020-08-21

'''Crawling many pages in one run is still not recommended: you can easily get banned, and how long the ban lasts is unknown.'''
import requests
from bs4 import BeautifulSoup

ii = 90
url = "https://scholar.paodekuaiweixinqun.com/scholar?start={}&q=Cylindrical+Shells&hl=zh-CN&as_sdt=0,5&as_ylo=2016".format(ii)
# https://scholar.paodekuaiweixinqun.com/scholar?start=140&q=Cylindrical+Shells&hl=zh-CN&as_sdt=0,5&as_ylo=2016
print(url)
try:
    kv = {'user-agent': 'Mozilla/5.0'}   # spoof a browser User-Agent to pass basic anti-crawler checks
    r = requests.get(url, headers=kv)
    r.raise_for_status()                 # raise HTTPError for any 4xx/5xx response
    r.encoding = r.apparent_encoding
except requests.RequestException:
    raise SystemExit("Failed to reach the site")
demo = r.text
soup = BeautifulSoup(demo, "html.parser")
#print(soup)
print('----------------------------------------------------------------------------------------------')
paperlist = []
for ss in soup.find_all('a', {"target": "_blank"}):   # result titles are links that open in a new tab
    tex = ss.get_text().replace('  ', '').split('\n')
    texp = ''
    if len(tex) >= 6:
        for t in tex:
            texp = texp + t              # stitch the title fragments back together
        paperlist.append(texp)
#print(paperlist)
for paper in paperlist:
    if len(paper) > 30:  # skip short fragments such as "[PDF] researchgate.net"
        print(paper)
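If you run the single-page script repeatedly, wrapping it in a function keeps the start offset as the only thing you change between runs. A minimal sketch of that idea; the helper name fetch_titles is illustrative and not part of the original script:

import requests
from bs4 import BeautifulSoup

def fetch_titles(start):
    """Fetch the result titles on one Google Scholar page (hypothetical helper)."""
    url = ("https://scholar.paodekuaiweixinqun.com/scholar"
           "?start={}&q=Cylindrical+Shells&hl=zh-CN&as_sdt=0,5&as_ylo=2016").format(start)
    kv = {'user-agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=kv, timeout=10)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    soup = BeautifulSoup(r.text, "html.parser")
    titles = []
    for ss in soup.find_all('a', {"target": "_blank"}):
        text = ''.join(ss.get_text().replace('  ', '').split('\n'))
        if len(text) > 30:   # same 30-character filter as above
            titles.append(text)
    return titles

for title in fetch_titles(90):   # equivalent to setting ii = 90 above
    print(title)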

Multi-page version (caution, caution, caution)

Caution: you are very likely to get banned, and for how long is unclear.

About bans: say the crawler is working through the pages and Google notices it on page 9. You get blocked, that page no longer opens (not even manually), and other pages may also fail to open intermittently.
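One way to soften this is to watch for a likely ban signal and back off before retrying, as in the sketch below. Google Scholar does not document its limits, so the status codes (403/429) and delay constants are assumptions, and polite_get is a hypothetical helper:

import random
import time
import requests

def polite_get(url, max_retries=3):
    """GET with a ban check and exponential backoff (hypothetical helper)."""
    headers = {'user-agent': 'Mozilla/5.0'}
    for attempt in range(max_retries):
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code in (403, 429):       # assumed ban / rate-limit signals
            wait = 60 * (2 ** attempt)        # back off: 60 s, 120 s, 240 s
            print("Possibly blocked, sleeping {} s".format(wait))
            time.sleep(wait + random.random() * 5)   # add jitter to the wait
            continue
        r.raise_for_status()
        return r
    raise RuntimeError("Still blocked after {} retries".format(max_retries))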

#encoding = utf8
# write by xdd1997  xdd2026@qq.com
# 2020-08-21
'''Easy to get banned -- easy to get banned -- easy to get banned'''
import requests
from bs4 import BeautifulSoup
import time
import random

for ii in range(0, 80, 10):  # the crawl got banned when it reached start=90
    url = "https://scholar.paodekuaiweixinqun.com/scholar?start={}&q=Cylindrical+Shells&hl=zh-CN&as_sdt=0,5&as_ylo=2016".format(ii)
    print(url)
    try:
        kv = {'user-agent': 'Mozilla/5.0'}   # spoof a browser User-Agent to pass basic anti-crawler checks
        r = requests.get(url, headers=kv)
        r.raise_for_status()                 # raise HTTPError for any 4xx/5xx response
        r.encoding = r.apparent_encoding
    except requests.RequestException:
        print("Failed to reach the site")
        continue                             # skip this page instead of crashing on an undefined r
    demo = r.text
    soup = BeautifulSoup(demo, "html.parser")
    #print(soup)
    print('----------------------------------------------------------------------------------------------')
    for ss in soup.find_all('a', {"target": "_blank"}):   # result titles are links that open in a new tab
        tex = ss.get_text().replace('  ', '').split('\n')
        if len(tex) == 7:
            print(tex[1] + ' ' + tex[3] + ' ' + tex[6])
    time.sleep(random.random() * 10 + 5)     # random 5-15 s pause between pages to look less like a bot