豆瓣就比较符合这个“明人不说暗话”的原则。所以我们扒豆瓣,不多说,直接上代码
from scrapy import app import re header = { 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.102 Safari/537.36', 'Host': 'movie.douban.com', 'Accept-Language': 'zh-CN,zh;q=0.9' } movie_url = "https://movie.douban.com/subject/26985127/?from=showing" m_id = re.search("[0-9]+", movie_url).group() # 获取soup对象 soup = app.get_soup(url=movie_url, headers=header, charset="utf-8") content = soup.find(id="content") # 抓取电影名字和上映年份 m_name = content.find("h1").find("span").string m_year = content.find(class_="year").string # 抓取导演 info = content.find(id="info") m_directer = info.find(attrs={"rel": "v:directedBy"}).string # 上映日期 m_date = info.find(attrs={"property": "v:initialReleaseDate"}).string # 类型 types = info.find_all(attrs={"property": "v:genre"}, limit=2) m_types = [] for type_ in types: m_types.append(type_.string) # 抓取主演,只取前面五个 actors = info.find(class_="actor").find_all(attrs={"rel": "v:starring"}, limit=5) m_actors = [] for actor in actors: m_actors.append(actor.string) # 片长 m_time = info.find(attrs={"property": "v:runtime"}).string # m_adaptor = info.select() print("id", m_id, "名称", m_name, "年份 ", m_year, "导演 ", m_directer, "主演", m_actors) print("上映日期", m_date, "类型", m_types, "片长", m_time)
输出:
id 26985127 名称 一出好戏 年份 (2018) 导演 黄渤 主演 ['黄渤', '舒淇', '王宝强', '张艺兴', '于和伟'] 上映日期 2018-08-10(*) 类型 ['剧情', '喜剧'] 片长 134分钟
简单粗暴