注:此代码仅用于个人爱好学习使用,不涉及任何商业行为!
话不多说,直接上代码:
#!/usr/bin/env python
# author: Simple-Sir
# time: 2019/7/20 20:36
# Scrape movie details from dytt8.net (电影天堂).
import requests
from lxml import etree

# Pretend to be a desktop browser so the site does not reject the request.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}

# Detail-page hrefs on the list pages are site-relative; prefix this domain.
BASE_DOMAIN = 'https://www.dytt8.net'


def getUrlText(url, coding):
    """Fetch *url* and return the page parsed into an lxml HTML tree.

    :param url: page URL to fetch
    :param coding: 'c' -> decode the raw bytes as GBK (used for detail pages);
                   any other value -> use the encoding requests detects.
    :return: lxml element tree of the page
    """
    # timeout: without it a stalled connection hangs the whole crawl.
    respons = requests.get(url, headers=HEADERS, timeout=30)
    if coding == 'c':
        # errors='ignore': these GBK pages occasionally contain bytes that
        # are not valid GBK; a strict decode raises UnicodeDecodeError.
        urlText = respons.content.decode('gbk', errors='ignore')
    else:
        urlText = respons.text
    return etree.HTML(urlText)


def getHref(url):
    """Return an iterable of absolute detail-page URLs found on a list page."""
    html = getUrlText(url, 't')
    aHref = html.xpath('//table[@class="tbspan"]//a/@href')
    # Give every relative href the BASE_DOMAIN prefix.
    return map(lambda u: BASE_DOMAIN + u, aHref)


def getPage(url):
    """Scrape one movie detail page and return a dict of its attributes."""
    html = getUrlText(url, 'c')
    moveInfo = {}  # movie information dictionary
    mName = html.xpath('//div[@class="title_all"]//font[@color="#07519a"]/text()')[0]
    moveInfo['电影名字'] = mName
    mDiv = html.xpath('//div[@id="Zoom"]')[0]
    mImgSrc = mDiv.xpath('.//img/@src')
    # Guard: some pages have no poster image at all; indexing [0] blindly
    # would raise IndexError.
    if mImgSrc:
        moveInfo['海报地址'] = mImgSrc[0]  # poster src
    if len(mImgSrc) >= 2:
        moveInfo['电影截图地址'] = mImgSrc[1]  # screenshot src
    mContnent = mDiv.xpath('.//text()')

    def pares_info(info, rule):
        """Remove the *rule* prefix from *info* and trim surrounding spaces."""
        return info.replace(rule, '').strip()

    def collect_lines(start, stop):
        """Gather stripped text nodes from index *start* until stop(line) is true."""
        lines = []
        for i in range(start, len(mContnent)):
            line = mContnent[i].strip()
            if stop(line):
                break
            lines.append(line)
        return lines

    # Single-line fields: text-node prefix -> output dict key.
    simple_fields = {
        '◎译 名': '译名',
        '◎片 名': '片名',
        '◎年 代': '年代',
        '◎产 地': '产地',
        '◎类 别': '类别',
        '◎语 言': '语言',
        '◎字 幕': '字幕',
        '◎上映日期': '上映日期',
        '◎IMDb评分': 'IMDb评分',
        '◎豆瓣评分': '豆瓣评分',
        '◎文件格式': '文件格式',
        '◎视频尺寸': '视频尺寸',
        '◎文件大小': '文件大小',
        '◎片 长': '片长',
        '◎导 演': '导演',
        '◎标 签': '标签',
    }

    for index, t in enumerate(mContnent):
        matched = False
        for prefix, key in simple_fields.items():
            if t.startswith(prefix):
                moveInfo[key] = pares_info(t, prefix)
                matched = True
                break
        if matched:
            continue
        # Multi-line fields: the value continues on the following text nodes
        # until the next '◎' marker (or the download-address section).
        if t.startswith('◎编 剧'):
            name = pares_info(t, '◎编 剧')
            moveInfo['编剧'] = [name] + collect_lines(
                index + 1, lambda s: s.startswith('◎'))
        elif t.startswith('◎主 演'):
            name = pares_info(t, '◎主 演')
            moveInfo['主演'] = [name] + collect_lines(
                index + 1, lambda s: s.startswith('◎'))
        elif t.startswith('◎简 介'):
            moveInfo['简介'] = collect_lines(
                index + 1,
                lambda s: s.startswith('◎获奖情况') or '【下载地址】' in s)
        elif t.startswith('◎获奖情况'):
            moveInfo['获奖情况'] = collect_lines(
                index + 1, lambda s: '【下载地址】' in s)

    # Guard: not every page carries a download link cell.
    downUrl = html.xpath('//td[@bgcolor="#fdfddf"]/a/@href')
    if downUrl:
        moveInfo['下载地址'] = downUrl[0]
    return moveInfo


def spider():
    """Ask for a page range, crawl every movie in it and write the results
    to '电影天堂电影信息.txt' in the current directory."""
    base_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    moves = []
    m = int(input('请输入您要获取的开始页:'))
    n = int(input('请输入您要获取的结束页:'))
    print('即将写入第{}页到第{}页的电影信息,请稍后...'.format(m, n))
    for i in range(m, n + 1):
        print('******* 第{}页电影 正在写入 ********'.format(i))
        url = base_url.format(i)
        for index, mhref in enumerate(getHref(url)):
            print('---- 第{}部电影 正在写入----'.format(index + 1))
            moves.append(getPage(mhref))
    # Open the output file once instead of re-opening it for every line.
    with open('电影天堂电影信息.txt', 'a+', encoding='utf-8') as f:
        for move in moves:
            f.write('\n********* {} ***************\n'.format(move['电影名字']))
            for info in move:
                f.write('{}:{}\n'.format(info, move[info]))
    print('写入完成!')


if __name__ == '__main__':
    spider()
执行情况:
结果文件: