import requests
import re
def get_page(url, timeout=10):
    """Download ``url`` with an HTTP GET request and return the response.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout ``requests`` can block indefinitely on a server that
            never answers.

    Returns:
        The ``requests.Response`` object. The status code is not checked
        here, so error responses are handed back to the caller as well.

    Raises:
        requests.exceptions.RequestException: if the connection fails or the
            timeout expires.
    """
    return requests.get(url, timeout=timeout)
# Fields of one movie entry, searched inside a single '<div class="item">'
# chunk. The trailing "inq" quote group is optional so that an entry without
# a quote still produces a result.
_MOVIE_ITEM_RE = re.compile(
    r'<em class="">(.*?)</em>.*?<a href="(.*?)">.*?<span class="title">(.*?)</span>'
    r'.*?导演: (.*?)主演: (.*?)<br>(.*?)</p>'
    r'.*?<span class="rating_num".*?>(.*?)</span>.*?<span>(.*?)人评价</span>'
    r'(?:.*?<span class="inq">(.*?)</span>)?',
    re.S,
)


def parse_index(html):
    """Extract the movie entries from the HTML of one index page.

    The page is cut at every ``<div class="item">`` marker and each piece is
    matched on its own. Matching the whole page at once lets the lazy ``.*?``
    wildcards run past the end of an entry that lacks one of the expected
    fields, which mixes data from two movies and drops the second one.

    Args:
        html: Page markup as a string.

    Returns:
        A list of 9-tuples of strings, in page order:
        ``(rank, url, title, director, cast, year_and_genre, rating,
        rating_count, quote)``. ``quote`` is ``''`` when the entry has no
        ``inq`` span. Entries missing any other field are skipped.
    """
    movies = []
    # Text before the first marker belongs to no entry, hence the [1:].
    for chunk in html.split('<div class="item">')[1:]:
        match = _MOVIE_ITEM_RE.search(chunk)
        if match is not None:
            # default='' mirrors what re.findall yields for an unmatched group.
            movies.append(match.groups(default=''))
    return movies
def save_data(movie, path='douban_top250.txt'):
    """Print one movie record and append it to a UTF-8 text file.

    Each label is paired with the field it actually describes; the rank is
    written under its own label and the combined year/genre text under one.

    Args:
        movie: Sequence of nine strings in the order
            ``(rank, url, title, director, cast, year_and_genre, rating,
            rating_count, quote)``.
        path: File the record is appended to.

    Raises:
        ValueError: if ``movie`` does not contain exactly nine items.
        OSError: if the file cannot be opened or written.
    """
    top, m_url, name, daoyan, actor, year_type, point, commit, desc = movie
    # Trim all surrounding whitespace, not only newlines, so the value does
    # not carry indentation from the page markup into the output.
    year_type = year_type.strip()
    data = (
        '\n'
        f'电影排名:{top}\n'
        f'电影url:{m_url}\n'
        f'电影名称:{name}\n'
        f'电影导演:{daoyan}\n'
        f'电影主演:{actor}\n'
        f'电影年份/类型:{year_type}\n'
        f'电影评分:{point}\n'
        f'电影评论:{commit}\n'
        f'电影简介:{desc}\n'
    )
    print(data)
    with open(path, 'a', encoding='utf-8') as f:
        f.write(data)
    # Report success only after the file has been closed.
    print(f'电影:{name} 写入成功')
if __name__ == '__main__':
    # Walk the ten index pages; each page lists 25 movies, so the "start"
    # query parameter advances 0, 25, ..., 225.
    for num in range(0, 250, 25):
        url = f'https://movie.douban.com/top250?start={num}&filter='
        print(url)
        page_html = get_page(url).text
        # Store every movie found on this page.
        for movie in parse_index(page_html):
            save_data(movie)