Python 爬虫小练习——爬取豆瓣电影 Top 250

时间:2024-01-20 19:54:59
import json
import pprint
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

# Offsets used for the `start` query parameter: 0, 25, ..., 225 (10 pages).
page_indexs = range(0, 250, 25)

# Request headers sent with every page request (browser-style User-agent).
headers = {
    'User-agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/1'
    )
}

# Seconds to wait on a single request before giving up, so that a stalled
# connection cannot hang the whole crawl.
REQUEST_TIMEOUT = 10


def download_all_htmls():
    """Download every listing page and return the raw HTML of each.

    One request is made per offset in ``page_indexs``, with a 0.5 second
    pause after each request.

    Returns:
        list: The response text of each page, in ``page_indexs`` order.

    Raises:
        Exception: If a page answers with a status code other than 200.
        requests.exceptions.RequestException: If the request itself fails
            (connection error, timeout, ...).
    """
    htmls = []
    for idx in page_indexs:
        url = "https://movie.douban.com/top250?start={}&filter=".format(idx)
        print("craw html", url)
        r = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT)
        if r.status_code != 200:
            # Report which page failed and how, instead of a bare "error".
            raise Exception(
                "unexpected status code {} for {}".format(r.status_code, url)
            )
        htmls.append(r.text)
        time.sleep(0.5)
    return htmls


def parse_single_html(html):
    """Extract the movie entries from the HTML of one listing page.

    Args:
        html: HTML text of a single listing page.

    Returns:
        list: One dict per ``div.item`` found, with the string-valued keys
        ``rank``, ``title``, ``rating_star``, ``rating_num`` and ``comments``.
    """
    # Parse with the standard-library backed "html.parser" mode.
    soup = BeautifulSoup(html, 'html.parser')

    # Drill down: div.article -> ol.grid_view -> every div.item.
    article_items = (
        soup.find("div", class_="article")
        .find("ol", class_="grid_view")
        .find_all("div", class_="item")
    )

    datas = []
    # Each item carries several fields, so pull them out step by step.
    for article_item in article_items:
        rank = article_item.find("div", class_="pic").find("em").get_text()

        info = article_item.find("div", class_="info")
        title = (
            info.find("div", class_="hd")
            .find("span", class_="title")
            .get_text()
        )

        stars = (
            info.find("div", class_="bd")
            .find("div", class_="star")
            .find_all("span")
        )
        # First span: its first CSS class encodes the star rating.
        rating_star = stars[0]["class"][0]
        # Second span: the numeric rating text.
        rating_num = stars[1].get_text()
        # Fourth span: the "<N>人评价" text; the position depends on the
        # page markup -- NOTE(review): verify if the layout changes.
        comments = stars[3].get_text()

        datas.append({
            "rank": rank,
            "title": title,
            # Strip the "rating" and "-t" parts of the class name.
            "rating_star": rating_star.replace("rating", "").replace("-t", ""),
            "rating_num": rating_num,
            # Strip the "人评价" suffix text from the count.
            "comments": comments.replace("人评价", ""),
        })
    return datas


def main():
    """Download all pages, parse them, and write the rows to an Excel file."""
    # Download the content of every page.
    htmls = download_all_htmls()

    # Parse each page and collect all rows into a single list.
    all_datas = []
    for html in htmls:
        all_datas.extend(parse_single_html(html))

    # Use pandas to dump all rows into a spreadsheet in one go.
    df = pd.DataFrame(all_datas)
    df.to_excel("doubanTOP250.xlsx")


if __name__ == '__main__':
    main()