初识Python 之 爬虫:爬取某电影网站信息

时间:2022-01-14 12:56:59

注:此代码仅用于个人爱好学习使用,不涉及任何商业行为!

 话不多说,直接上代码:

#!/usr/bin/env python
# author:Simple-Sir
# time:2019/7/20 20:36
# 获取电影天堂详细信息
import requests
from lxml import etree

# Spoof a desktop-browser User-Agent so the site serves normal pages
# instead of rejecting the script as a bot.
HEADERS ={
    'User-Agent':'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36',
}
# Site root; relative detail-page hrefs scraped from list pages are
# joined onto this to form absolute URLs.
BASE_DOMAIN = 'https://www.dytt8.net'
# Fetch a page and return its parsed lxml tree.
def getUrlText(url, coding):
    """Download *url* and parse the HTML with lxml.

    :param url: page URL to fetch
    :param coding: 'c' to decode the body explicitly as GBK (detail
                   pages are GBK-encoded); any other value uses the
                   encoding requests detects (list pages)
    :return: lxml root element (result of ``etree.HTML``)
    """
    # timeout keeps a stalled connection from hanging the scraper forever
    response = requests.get(url, headers=HEADERS, timeout=30)
    if coding == 'c':
        # Detail pages declare/use GBK; decode the raw bytes ourselves.
        # errors='ignore' tolerates the occasional malformed byte instead
        # of aborting the whole page with UnicodeDecodeError.
        page_text = response.content.decode('gbk', errors='ignore')
    else:
        page_text = response.text
    return etree.HTML(page_text)

# Collect the detail-page links listed on one index (list) page.
def getHref(url):
    """Return an iterable of absolute movie detail-page URLs found on *url*."""
    page = getUrlText(url, 't')
    relative_links = page.xpath('//table[@class="tbspan"]//a/@href')
    # Hrefs on the list page are site-relative; prefix the domain lazily.
    return (BASE_DOMAIN + link for link in relative_links)

# Field labels that occupy a single text line on the detail page,
# mapped to the dict key the value is stored under.
_SINGLE_LINE_FIELDS = {
    '◎译  名': '译名',
    '◎片  名': '片名',
    '◎年  代': '年代',
    '◎产  地': '产地',
    '◎类  别': '类别',
    '◎语  言': '语言',
    '◎字  幕': '字幕',
    '◎上映日期': '上映日期',
    '◎IMDb评分': 'IMDb评分',
    '◎豆瓣评分': '豆瓣评分',
    '◎文件格式': '文件格式',
    '◎视频尺寸': '视频尺寸',
    '◎文件大小': '文件大小',
    '◎片  长': '片长',
    '◎导  演': '导演',
    '◎标  签': '标签',
}


# Parse one movie detail page (GBK encoded) into a dict of its fields.
def getPage(url):
    """Scrape a single movie detail page.

    :param url: absolute URL of the detail page
    :return: dict with the movie's name, poster/screenshot src, all
             "◎"-labelled metadata fields, and the download address
    """
    html = getUrlText(url, 'c')
    moveInfo = {}
    moveInfo['电影名字'] = html.xpath(
        '//div[@class="title_all"]//font[@color="#07519a"]/text()')[0]
    zoom = html.xpath('//div[@id="Zoom"]')[0]
    img_srcs = zoom.xpath('.//img/@src')
    moveInfo['海报地址'] = img_srcs[0]  # first image is the poster
    if len(img_srcs) >= 2:
        moveInfo['电影截图地址'] = img_srcs[1]  # second is a screenshot
    lines = zoom.xpath('.//text()')

    def clean(text, label):
        """Strip *label* out of *text* and trim surrounding whitespace."""
        return text.replace(label, '').strip()

    def collect_until(start, stop):
        """Gather stripped lines from index *start* until *stop* matches."""
        collected = []
        for raw in lines[start:]:
            item = raw.strip()
            if stop(item):
                break
            collected.append(item)
        return collected

    for index, line in enumerate(lines):
        # Single-line fields: label and value share one text node.
        label = next(
            (lb for lb in _SINGLE_LINE_FIELDS if line.startswith(lb)), None)
        if label is not None:
            moveInfo[_SINGLE_LINE_FIELDS[label]] = clean(line, label)
            continue
        # Multi-line fields: first value is on the label line, the rest
        # follow one per line until the next "◎" label (or a sentinel).
        if line.startswith('◎编  剧'):
            # BUGFIX: the original tested startswith('') — always True —
            # so it broke immediately and never collected extra writers.
            writers = [clean(line, '◎编  剧')]
            writers += collect_until(index + 1, lambda s: s.startswith('◎'))
            moveInfo['编剧'] = writers
        elif line.startswith('◎主  演'):
            # Same BUGFIX as above for the actor list.
            actors = [clean(line, '◎主  演')]
            actors += collect_until(index + 1, lambda s: s.startswith('◎'))
            moveInfo['主演'] = actors
        elif line.startswith('◎简  介'):
            # Synopsis text starts on the following lines.
            moveInfo['简介'] = collect_until(
                index + 1,
                lambda s: s.startswith('◎获奖情况') or '【下载地址】' in s)
        elif line.startswith('◎获奖情况'):
            moveInfo['获奖情况'] = collect_until(
                index + 1, lambda s: '【下载地址】' in s)
    moveInfo['下载地址'] = html.xpath('//td[@bgcolor="#fdfddf"]/a/@href')[0]
    return moveInfo

# Interactively scrape a page range and append every movie's details
# to a local text file.
def spider():
    """Prompt for a start/end list-page number, scrape every movie in
    that range, and append the results to 电影天堂电影信息.txt.
    """
    base_url = 'https://www.dytt8.net/html/gndy/dyzz/list_23_{}.html'
    moves = []
    m = int(input('请输入您要获取的开始页:'))
    n = int(input('请输入您要获取的结束页:'))
    print('即将写入第{}页到第{}页的电影信息,请稍后...'.format(m, n))
    for page in range(m, n + 1):
        print('******* 第{}页电影 正在写入 ********'.format(page))
        url = base_url.format(page)
        for index, mhref in enumerate(getHref(url)):
            print('---- 第{}部电影 正在写入----'.format(index + 1))
            moves.append(getPage(mhref))
    # Write all collected movies to the local file. Open it ONCE — the
    # original re-opened the file for every single line written, which
    # is wasteful and needlessly fragile.
    with open('电影天堂电影信息.txt', 'a+', encoding='utf-8') as f:
        for move in moves:
            f.write('\n********* {} ***************\n'.format(move['电影名字']))
            for info in move:
                f.write('{}:{}\n'.format(info, move[info]))
    print('写入完成!')

# Run the interactive scraper only when executed as a script,
# not when this module is imported.
if __name__ == '__main__':
    spider()

执行情况:

初识Python 之 爬虫:爬取某电影网站信息

 初识Python 之 爬虫:爬取某电影网站信息

 

结果文件:

初识Python 之 爬虫:爬取某电影网站信息