bs4抓取糗事百科

时间:2022-07-11 09:03:37

抓取糗事百科的内容及评论,不包含图片信息。使用前请把浏览器的 User-Agent 字符串填入代码中 'User-Agent' 对应的 '' 里。以360极速浏览器为例:在地址栏输入 about:version 并回车,"用户代理"后面的一长串字符就是需要填入的内容。其他浏览器的查看方法可以自行百度。

import urllib.request
import re
from urllib import request
from bs4 import BeautifulSoup #1.获取网页源代码
def get_html(url):
    """Download *url* and return its body decoded as UTF-8 text.

    Args:
        url: Address of the page to fetch.

    Returns:
        The response body as a ``str``.

    Raises:
        urllib.error.URLError: If the request cannot be completed.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    headers = {
        # Placeholder: paste your browser's User-Agent string here.
        'User-Agent': '',
    }
    req = request.Request(url=url, headers=headers)
    # The context manager closes the response even if reading or decoding
    # raises, so the underlying connection is never leaked.
    with request.urlopen(req) as response:
        return response.read().decode('utf-8')


# Collect the comment links
def get_comment_link(content, comment_url_base):
    """Walk the articles on a listing page and print each with its comments.

    For every matching article element the numeric part of its ``id`` is
    substituted into *comment_url_base*, and the resulting URL is handed to
    ``get_comment_content`` together with a running article number.

    Args:
        content: HTML source of a listing page.
        comment_url_base: ``%``-format template taking the article id and
            producing the URL of that article's detail page.

    Elements whose ``id`` is missing or has fewer than three
    ``_``-separated parts are skipped instead of raising ``IndexError``;
    skipped elements do not consume an article number.
    """
    soup = BeautifulSoup(content, 'html.parser')
    # A non-dict ``attrs`` value is treated by Beautiful Soup as a filter on
    # the CSS class. Compiled once, outside the loop.
    article_pattern = re.compile(r"article block untagged mb15.*?")
    article_floor = 1
    for article in soup.find_all(attrs=article_pattern):
        id_parts = str(article.get('id')).strip().split("_")
        if len(id_parts) < 3:
            # No usable article number, so no detail URL can be built.
            continue
        comment_url = comment_url_base % id_parts[2]  # detail-page link
        get_comment_content(comment_url, article_floor)
        article_floor += 1


# Fetch an article's text and its comments
def get_comment_content(comment_url, articleFloor):
    """Print one article's text followed by its numbered comments.

    Args:
        comment_url: URL of the article's detail page.
        articleFloor: Running number printed in front of the article text.
    """
    comment_page = get_html(comment_url)
    soup = BeautifulSoup(comment_page, 'html.parser')
    # Article text
    for item in soup.find_all('div', class_='content'):
        print(articleFloor, ".", item.get_text().strip())
    # Comments, numbered from 1. A string ``attrs`` value is treated by
    # Beautiful Soup as a filter on the CSS class.
    for comment_floor, comment in enumerate(soup.find_all(attrs="body"), start=1):
        print(" ", comment_floor, "楼回复:", comment.get_text())


def command():
    """Ask the user whether to run the scraper and act on the answer.

    Pressing Enter (empty input) or typing ``enter`` runs ``main`` once.
    Any other input, including ``exit``, returns without doing anything.
    """
    raw = input("点击enter查看或者输入exit退出,请输入你的选择:")
    # The prompt tells the user to press Enter, which yields an empty
    # string, so accept that as well as the literal word "enter".
    if raw.strip().lower() in ('', 'enter'):
        main()


def main(page=2):
    """Scrape one listing page and print its articles and comments.

    Args:
        page: Number of the listing page to fetch. Defaults to 2.
    """
    article_url_base = 'https://www.qiushibaike.com/8hr/page/%d/'  # listing page
    comment_url_base = 'https://www.qiushibaike.com/article/%s'  # detail page
    content = get_html(article_url_base % page)
    get_comment_link(content, comment_url_base)


if __name__ == '__main__':
    command()