day02 python从入门到放弃----爬取豆瓣电影Top250

时间:2021-10-20 12:24:15

实验环境:Python 3.7,Windows 10

使用到的库:requests(第三方库,需要先执行 pip install requests 安装)和 re(Python 标准库)

import requests
import re

 

#获取网页源代码
def Get_Data(Url, headers=None, timeout=10):
    """Fetch *Url* with an HTTP GET and return the ``requests`` response.

    Args:
        Url: Address to request.
        headers: Optional mapping of extra request headers; entries here
            override the default ``User-Agent`` set below.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection blocks forever.

    Returns:
        The ``requests.Response`` object. The HTTP status is NOT checked
        here; callers that care should inspect ``response.status_code``.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    # NOTE(review): movie.douban.com reportedly rejects the default
    # "python-requests/x.y" User-Agent (HTTP 418), which leaves the scraper
    # with nothing to parse - confirm. A browser-like value avoids that.
    request_headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/94.0.4606.81 Safari/537.36'
        ),
    }
    if headers:
        request_headers.update(headers)
    response = requests.get(Url, headers=request_headers, timeout=timeout)
    return response
#正则匹配找寻所需数据
def Print_Data(Res):
    """Extract one record per movie from the HTML text of a listing page.

    The page is cut at every ``<div class="item">`` marker and each piece is
    parsed on its own. Matching the whole page with one lazy ``re.S``
    pattern lets a match run past the end of an entry whenever that entry
    lacks ``主演: `` or ``&nbsp``, which mixes fields from two movies and
    silently drops one of them.

    Args:
        Res: HTML source of the page as a string.

    Returns:
        A list of ``(rank, url, title, director, performer, rating)`` tuples
        of strings, in page order. ``director``, ``performer`` and
        ``rating`` are ``''`` when the entry does not contain them. Pieces
        without a rank, link and title are skipped.
    """
    item_marker = '<div class="item">'
    head_re = re.compile(
        r'<em class="">(.*?)</em>.*?<a href="(.*?)">.*?'
        r'<span class="title">(.*?)</span>',
        re.S,
    )
    # The credits are the text between <p class=""> and the first <br>
    # (or </p> when there is no <br>).
    credit_re = re.compile(r'<p class="">(.*?)(?:<br>|</p>)', re.S)
    # The director ends at the first &nbsp, or at the end of the credits
    # when no performer follows.
    director_re = re.compile(r'导演: (.*?)(?:&nbsp|\Z)', re.S)
    performer_re = re.compile(r'主演: (.*)', re.S)
    rating_re = re.compile(
        r'<span class="rating_num" property="v:average">(.*?)</span>',
        re.S,
    )

    Data_Temp = []
    # Element 0 is whatever precedes the first entry, so it is discarded.
    for item in Res.split(item_marker)[1:]:
        head = head_re.search(item)
        if head is None:
            continue
        top, url, name = head.groups()

        director = ''
        performer = ''
        cursor = head.end()
        credit_match = credit_re.search(item, cursor)
        if credit_match is not None:
            cursor = credit_match.end()
            credit_text = credit_match.group(1)
            director_match = director_re.search(credit_text)
            if director_match is not None:
                director = director_match.group(1)
                # Only look for the performer after the director, keeping
                # the same field order the page uses.
                performer_match = performer_re.search(
                    credit_text, director_match.end()
                )
                if performer_match is not None:
                    performer = performer_match.group(1)

        rating_match = rating_re.search(item, cursor)
        rating = rating_match.group(1) if rating_match is not None else ''
        Data_Temp.append((top, url, name, director, performer, rating))
    return Data_Temp
#写入文本并输出打印信息
def Save_Data(file_name, file_content):
    """Print one movie record and append it to ``<file_name>.txt``.

    Args:
        file_name: Base name of the output file. Every ``/`` is replaced by
            ``_`` and ``.txt`` is appended; the file is opened in append
            mode with UTF-8 encoding, relative to the working directory.
        file_content: Sequence of exactly six values, in the order
            ``(rank, url, title, director, performer, rating)``.

    Raises:
        ValueError: if ``file_content`` does not hold exactly six values.
    """
    top, url, name, director, performer, rating = file_content
    # The sixth field is the rating score, so it is labelled 电影评分
    # (rating) rather than 电影评论 (review).
    Data = f'''
    ============================
    电影排名:{top}
    电影链接:{url}
    电影名字:{name}
    电影导演:{director}
    电影主演:{performer}
    电影评分:{rating}
    ============================
    \n
    '''
    print(Data)
    # The with-statement closes the file, so no explicit close() is needed.
    with open(file_name.replace('/', '_') + ".txt", "a", encoding='utf-8') as f:
        f.write(Data)
#函数测试实现
# Pieces of the listing URL. The port is written out explicitly and
# File_Name ends with the "start=" query key that the offset is appended to.
Head_Agreement = 'https'
Domain = 'movie.douban.com'
Port = '443'
File_Name = 'top250?start='

# Offsets 0, 25, ..., 225: ten requests in total (presumably 25 movies per
# page - confirm against the site).
for Tmp in range(0, 226, 25):
    Url_Link = f'{Head_Agreement}://{Domain}:{Port}/{File_Name}{Tmp}'
    Res = Get_Data(Url_Link)
    Data = Print_Data(Res.text)
    # Every extracted record is printed and appended to movie.txt.
    for movie in Data:
        Save_Data('movie', movie)
print('Print movie working done!')

下面是完整代码实现(与上文分步讲解的代码相同,只是去掉了注释):

import requests
import re

def Get_Data(Url, headers=None, timeout=10):
    """Fetch *Url* with an HTTP GET and return the ``requests`` response.

    Args:
        Url: Address to request.
        headers: Optional mapping of extra request headers; entries here
            override the default ``User-Agent`` set below.
        timeout: Seconds to wait for the server before ``requests`` gives
            up. Without a timeout a stalled connection blocks forever.

    Returns:
        The ``requests.Response`` object. The HTTP status is NOT checked
        here; callers that care should inspect ``response.status_code``.

    Raises:
        requests.exceptions.RequestException: on connection errors or when
            the timeout expires.
    """
    # NOTE(review): movie.douban.com reportedly rejects the default
    # "python-requests/x.y" User-Agent (HTTP 418), which leaves the scraper
    # with nothing to parse - confirm. A browser-like value avoids that.
    request_headers = {
        'User-Agent': (
            'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
            'AppleWebKit/537.36 (KHTML, like Gecko) '
            'Chrome/94.0.4606.81 Safari/537.36'
        ),
    }
    if headers:
        request_headers.update(headers)
    response = requests.get(Url, headers=request_headers, timeout=timeout)
    return response

def Print_Data(Res):
    """Extract one record per movie from the HTML text of a listing page.

    The page is cut at every ``<div class="item">`` marker and each piece is
    parsed on its own. Matching the whole page with one lazy ``re.S``
    pattern lets a match run past the end of an entry whenever that entry
    lacks ``主演: `` or ``&nbsp``, which mixes fields from two movies and
    silently drops one of them.

    Args:
        Res: HTML source of the page as a string.

    Returns:
        A list of ``(rank, url, title, director, performer, rating)`` tuples
        of strings, in page order. ``director``, ``performer`` and
        ``rating`` are ``''`` when the entry does not contain them. Pieces
        without a rank, link and title are skipped.
    """
    item_marker = '<div class="item">'
    head_re = re.compile(
        r'<em class="">(.*?)</em>.*?<a href="(.*?)">.*?'
        r'<span class="title">(.*?)</span>',
        re.S,
    )
    # The credits are the text between <p class=""> and the first <br>
    # (or </p> when there is no <br>).
    credit_re = re.compile(r'<p class="">(.*?)(?:<br>|</p>)', re.S)
    # The director ends at the first &nbsp, or at the end of the credits
    # when no performer follows.
    director_re = re.compile(r'导演: (.*?)(?:&nbsp|\Z)', re.S)
    performer_re = re.compile(r'主演: (.*)', re.S)
    rating_re = re.compile(
        r'<span class="rating_num" property="v:average">(.*?)</span>',
        re.S,
    )

    Data_Temp = []
    # Element 0 is whatever precedes the first entry, so it is discarded.
    for item in Res.split(item_marker)[1:]:
        head = head_re.search(item)
        if head is None:
            continue
        top, url, name = head.groups()

        director = ''
        performer = ''
        cursor = head.end()
        credit_match = credit_re.search(item, cursor)
        if credit_match is not None:
            cursor = credit_match.end()
            credit_text = credit_match.group(1)
            director_match = director_re.search(credit_text)
            if director_match is not None:
                director = director_match.group(1)
                # Only look for the performer after the director, keeping
                # the same field order the page uses.
                performer_match = performer_re.search(
                    credit_text, director_match.end()
                )
                if performer_match is not None:
                    performer = performer_match.group(1)

        rating_match = rating_re.search(item, cursor)
        rating = rating_match.group(1) if rating_match is not None else ''
        Data_Temp.append((top, url, name, director, performer, rating))
    return Data_Temp

def Save_Data(file_name, file_content):
    """Print one movie record and append it to ``<file_name>.txt``.

    Args:
        file_name: Base name of the output file. Every ``/`` is replaced by
            ``_`` and ``.txt`` is appended; the file is opened in append
            mode with UTF-8 encoding, relative to the working directory.
        file_content: Sequence of exactly six values, in the order
            ``(rank, url, title, director, performer, rating)``.

    Raises:
        ValueError: if ``file_content`` does not hold exactly six values.
    """
    top, url, name, director, performer, rating = file_content
    # The sixth field is the rating score, so it is labelled 电影评分
    # (rating) rather than 电影评论 (review).
    Data = f'''
    ============================
    电影排名:{top}
    电影链接:{url}
    电影名字:{name}
    电影导演:{director}
    电影主演:{performer}
    电影评分:{rating}
    ============================
    \n
    '''
    print(Data)
    # The with-statement closes the file, so no explicit close() is needed.
    with open(file_name.replace('/', '_') + ".txt", "a", encoding='utf-8') as f:
        f.write(Data)

# Pieces of the listing URL. The port is written out explicitly and
# File_Name ends with the "start=" query key that the offset is appended to.
Head_Agreement = 'https'
Domain = 'movie.douban.com'
Port = '443'
File_Name = 'top250?start='

# Offsets 0, 25, ..., 225: ten requests in total (presumably 25 movies per
# page - confirm against the site).
for Tmp in range(0, 226, 25):
    Url_Link = f'{Head_Agreement}://{Domain}:{Port}/{File_Name}{Tmp}'
    Res = Get_Data(Url_Link)
    Data = Print_Data(Res.text)
    # Every extracted record is printed and appended to movie.txt.
    for movie in Data:
        Save_Data('movie', movie)
print('Print movie working done!')