实验环境:Python 3.7,Windows 10
使用到的库
import requests
import re
# Fetch the raw HTTP response for a page.
def Get_Data(Url, timeout=10, headers=None):
    """Send a GET request to ``Url`` and return the response object.

    Args:
        Url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, requests waits indefinitely, so one stalled
            connection would hang the whole run.
        headers: Optional mapping of extra HTTP headers (for example a
            browser-like ``User-Agent``) to send with the request.

    Returns:
        The response object returned by ``requests.get``. The HTTP status
        code is not checked here; callers read ``.text`` themselves.
    """
    response = requests.get(Url, headers=headers, timeout=timeout)
    return response
# Extract the wanted fields from the page source with regular expressions.
_ITEM_MARKER = '<div class="item">'

# Applied to the text that follows one item marker. The credits line is
# captured whole (everything between "导演: " and the first <br>) and is
# split into director / performer afterwards.
_ITEM_PATTERN = re.compile(
    r'.*?<em class="">(.*?)</em>'
    r'.*?<a href="(.*?)">'
    r'.*?<span class="title">(.*?)</span>'
    r'.*?<p class="">.*?导演: (.*?)<br>'
    r'.*?<span class="rating_num" property="v:average">(.*?)</span>',
    re.S,
)


def Print_Data(Res):
    """Parse the movie entries out of one page of HTML.

    The page is first cut at every ``<div class="item">`` marker and each
    piece is parsed on its own. Matching the whole page with one DOTALL
    pattern lets a match run past the end of an entry: when an entry has no
    "主演: " text, the match would borrow the cast and rating of the next
    entry and that next entry would be lost.

    Args:
        Res: HTML source of the page as a string.

    Returns:
        A list of 6-tuples of strings
        ``(rank, link, title, director, performer, rating)`` in page order.
        ``director`` is the text after "导演: " up to the first space.
        ``performer`` is the text after "主演: " on the credits line, or an
        empty string when that label is absent. Pieces that lack any of the
        other fields are skipped.
    """
    movies = []
    # The text before the first marker is page chrome, not an entry.
    for chunk in Res.split(_ITEM_MARKER)[1:]:
        found = _ITEM_PATTERN.match(chunk)
        if found is None:
            continue
        top, url, name, crew, rating = found.groups()
        # Split off the cast first so a missing label yields '' instead of
        # disturbing the director field.
        lead, _, performer = crew.partition('主演: ')
        director = lead.partition(' ')[0]
        movies.append((top, url, name, director, performer, rating))
    return movies
# Append one record to a text file and echo it to the console.
def Save_Data(file_name, file_content):
    """Format one movie record, print it, and append it to a text file.

    Args:
        file_name: Base name of the output file. Any ``/`` is replaced by
            ``_`` and ``.txt`` is appended.
        file_content: Sequence of exactly six values, unpacked in order as
            rank, link, title, director, performer and the value shown
            under the "电影评论" label.

    Raises:
        ValueError: If ``file_content`` does not unpack into six values.
    """
    top, url, name, director, performer, comment = file_content
    record = f'''
    ============================
    电影排名:{top}
    电影链接:{url}
    电影名字:{name}
    电影导演:{director}
    电影主演:{performer}
    电影评论:{comment}
    ============================
    \n
    '''
    print(record)
    # Append mode keeps earlier records; the with-block closes the file, so
    # no explicit close() call is needed.
    with open(file_name.replace('/', '_') + ".txt", "a", encoding='utf-8') as f:
        f.write(record)
# Test run of the functions above.
Head_Agreement = 'https'
Domain = 'movie.douban.com'
Port = '443'
File_Name = 'top250?start='

# The offsets 0, 25, ..., 225 are appended to the URL as the "start" value.
for Tmp in range(0, 226, 25):
    Url_Link = f'{Head_Agreement}://{Domain}:{Port}/{File_Name}{Tmp}'
    Res = Get_Data(Url_Link)
    Data = Print_Data(Res.text)
    # Every parsed record is printed and appended to movie.txt.
    for movie in Data:
        Save_Data('movie', movie)
print('Print movie working done!')
下面是完整代码实现
import re

import requests

_ITEM_MARKER = '<div class="item">'

# Applied to the text that follows one item marker. The credits line is
# captured whole (everything between "导演: " and the first <br>) and is
# split into director / performer afterwards.
_ITEM_PATTERN = re.compile(
    r'.*?<em class="">(.*?)</em>'
    r'.*?<a href="(.*?)">'
    r'.*?<span class="title">(.*?)</span>'
    r'.*?<p class="">.*?导演: (.*?)<br>'
    r'.*?<span class="rating_num" property="v:average">(.*?)</span>',
    re.S,
)


def Get_Data(Url, timeout=10, headers=None):
    """Send a GET request to ``Url`` and return the response object.

    Args:
        Url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, requests waits indefinitely, so one stalled
            connection would hang the whole run.
        headers: Optional mapping of extra HTTP headers to send.

    Returns:
        The response object returned by ``requests.get``. The HTTP status
        code is not checked here.
    """
    response = requests.get(Url, headers=headers, timeout=timeout)
    return response


def Print_Data(Res):
    """Parse the movie entries out of one page of HTML.

    The page is cut at every ``<div class="item">`` marker and each piece
    is parsed on its own, so an entry without "主演: " text cannot borrow
    the cast and rating of the following entry.

    Args:
        Res: HTML source of the page as a string.

    Returns:
        A list of 6-tuples of strings
        ``(rank, link, title, director, performer, rating)`` in page order.
        ``performer`` is an empty string when the "主演: " label is absent;
        pieces lacking any other field are skipped.
    """
    movies = []
    # The text before the first marker is page chrome, not an entry.
    for chunk in Res.split(_ITEM_MARKER)[1:]:
        found = _ITEM_PATTERN.match(chunk)
        if found is None:
            continue
        top, url, name, crew, rating = found.groups()
        # Split off the cast first so a missing label yields '' instead of
        # disturbing the director field.
        lead, _, performer = crew.partition('主演: ')
        director = lead.partition(' ')[0]
        movies.append((top, url, name, director, performer, rating))
    return movies


def Save_Data(file_name, file_content):
    """Format one movie record, print it, and append it to a text file.

    Args:
        file_name: Base name of the output file. Any ``/`` is replaced by
            ``_`` and ``.txt`` is appended.
        file_content: Sequence of exactly six values, unpacked in order as
            rank, link, title, director, performer and the value shown
            under the "电影评论" label.

    Raises:
        ValueError: If ``file_content`` does not unpack into six values.
    """
    top, url, name, director, performer, comment = file_content
    record = f'''
    ============================
    电影排名:{top}
    电影链接:{url}
    电影名字:{name}
    电影导演:{director}
    电影主演:{performer}
    电影评论:{comment}
    ============================
    \n
    '''
    print(record)
    # Append mode keeps earlier records; the with-block closes the file, so
    # no explicit close() call is needed.
    with open(file_name.replace('/', '_') + ".txt", "a", encoding='utf-8') as f:
        f.write(record)


Head_Agreement = 'https'
Domain = 'movie.douban.com'
Port = '443'
File_Name = 'top250?start='

# The offsets 0, 25, ..., 225 are appended to the URL as the "start" value.
for Tmp in range(0, 226, 25):
    Url_Link = Head_Agreement + '://' + Domain + ':' + Port + '/' + File_Name + str(Tmp)
    Res = Get_Data(Url_Link)
    Data = Print_Data(Res.text)
    # Every parsed record is printed and appended to movie.txt.
    for movie in Data:
        Save_Data('movie', movie)
print('Print movie working done!')