My first web scraper

Date: 2022-10-31 16:39:47
# Wrap the comment-count scraping logic into a function
import re
import json
import requests
commentURL = ('http://comment5.news.sina.com.cn/page/info?version=1&format=js&'
              'channel=gn&newsid=comos-{}&group=&compress=0&ie=utf-8&oe=utf-8&'
              'page=1&page_size=20')

def getCommentCount(newsurl):
    # Extract the news id from the article URL (doc-i<id>.shtml)
    m = re.search('doc-i(.*).shtml', newsurl)
    newsid = m.group(1)
    # Request the comment API and strip the 'var data=' prefix before parsing the JSON
    comments = requests.get(commentURL.format(newsid))
    jd = json.loads(comments.text.strip('var data='))
    return jd['result']['count']['total']
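A quick sanity check of the function might look like the sketch below. The article URL is only a made-up example that follows the doc-i….shtml pattern the regex expects, and the Sina comment API may no longer respond the way it did when this was written.

# Hypothetical article URL, used only to illustrate the expected doc-i<id>.shtml pattern
sample_url = 'http://news.sina.com.cn/c/nd/2022-10-31/doc-ifxvyqwa1234567.shtml'
print(getCommentCount(sample_url))  # should print the total comment count as an integer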
# Wrap the article-detail scraping logic into a function
from bs4 import BeautifulSoup
from datetime import datetime

def getNewsDetail(newsurl):
    result = {}
    res = requests.get(newsurl)
    res.encoding = 'utf-8'
    soup = BeautifulSoup(res.text, 'html.parser')
    result['title'] = soup.select('#artibodyTitle')[0].text
    result['newssource'] = soup.select('.time-source span a')[0].text
    # The publish time is the first text node inside .time-source
    timesource = soup.select('.time-source')[0].contents[0].strip()
    result['dt'] = datetime.strptime(timesource, '%Y年%m月%d日%H:%M')
    # Join every body paragraph except the last one (the editor line)
    result['article'] = ' '.join([p.text.strip() for p in soup.select('#artibody p')[:-1]])
    # Strip the '责任编辑:' (editor-in-charge) label, keeping only the name
    result['editor'] = soup.select('.article-editor')[0].text.strip('责任编辑:')
    result['comments'] = getCommentCount(newsurl)
    return result
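If the page markup still matches the selectors above, calling the function on one article (using the same hypothetical sample_url from the earlier sketch) returns a dict with all of the fields:

detail = getNewsDetail(sample_url)  # sample_url is the hypothetical URL from the earlier sketch
print(detail['title'], detail['dt'], detail['comments'])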
# Build a function that parses a paginated list-API response into article details
def parseListLinks(url):
    newsdetails = []
    res = requests.get(url)
    # Peel off the JSONP wrapper 'newsloadercallback(...);' to get plain JSON
    jd = json.loads(res.text.lstrip(' newsloadercallback(').rstrip(');'))
    for ent in jd['result']['data']:
        newsdetails.append(getNewsDetail(ent['url']))
    return newsdetails
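One thing worth noting: lstrip and rstrip remove character sets, not literal prefixes and suffixes; the call above happens to work because the JSON body starts with '{' and ends with '}'. As a sketch of a more defensive alternative (stripJsonp is a hypothetical helper, not part of the original script), the JSONP wrapper could be peeled off with a regex instead:

def stripJsonp(text):
    # Take everything between the first '(' and the last ')' of the callback wrapper
    m = re.search(r'\((.*)\)', text, re.S)
    return m.group(1) if m else text

# jd = json.loads(stripJsonp(res.text))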
# Batch-scrape the article details on every list page (use a for loop to build the page URLs)
url = 'http://api.roll.news.sina.com.cn/zt_list?channel=news&cat_1=gnxw&cat_2==gdxw1||=gatxw||=zs-pl||=mtjj&level==1||=2&show_ext=1&show_all=1&show_num=22&tag=1&format=json&page={}'
news_total = []
for i in range(1, 3):
    newsurl = url.format(i)
    newsary = parseListLinks(newsurl)
    news_total.extend(newsary)

# Organize the data with pandas
import pandas
df = pandas.DataFrame(news_total)
df.head()

# Save the data to Excel
df.to_excel('news.xlsx')
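To double-check the export, the spreadsheet can be read back with pandas; note this assumes the openpyxl package is installed, which pandas needs for .xlsx files.

df2 = pandas.read_excel('news.xlsx')
print(df2.shape)    # number of scraped articles x number of fields
print(df2.columns)  # expected fields: title, newssource, dt, article, editor, comments (plus the saved index)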


Scraping Sina news