基于 Python 的豆瓣“我看过的电影”爬虫

时间:2022-03-08 19:16:59
#!/usr/bin/env python
# -*- coding: cp936 -*-
# Filename: backup_ver1.py

import urllib2
import re
import sys
import xlwt
import time

# Create the output workbook with a single sheet and fill in its header row.
wbk = xlwt.Workbook()
sheet1 = wbk.add_sheet("my_sheet1")
# Column order: movie code, movie title, star rating, date, tags, short review.
for column, heading in enumerate(
        (u'影片代码', u'影片名称', u'星级', u'日期', u'标签', u'短评')):
    sheet1.write(0, column, heading)

# Python 2 only: sys.setdefaultencoding is removed from the sys module at
# interpreter start-up, so the module is reloaded to get it back; implicit
# str <-> unicode conversions then use UTF-8 instead of ASCII.  The two
# statements must stay in this order.
reload(sys)
sys.setdefaultencoding("utf-8")

# 打印系统初始化界面
print u"""
---------------------------------------
程序:豆瓣爬虫
版本:2.0
作者:anzic
日期:2014-11-13
语言:Python 2.7
功能:按提示输入后提取指定用户的豆列
---------------------------------------
"""
print u'请输入用户代号,回车则为默认'
ainput=raw_input()
if ainput=='':
person=60683287
else:
person=ainput
print u'请输入要打印的页数'
page_num=raw_input()

user_agent = 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'
headers = { 'User-Agent' : user_agent }

i=0
for page in range(1,int(page_num)+1):
myUrl = "http://m.douban.com/movie/people/"+str(person)+"/watched?page="+str(page)
req = urllib2.Request(myUrl,headers = headers)
response = urllib2.urlopen(req)
the_page = response.read()

#f = file(u'抓取网页.html', 'w')
#f.write(the_page)
#f.close

uPage = the_page.decode("utf-8")

#re.findall()返回list
myItems = re.findall('<a href="/movie/subject/.*?<br>.*?</div>',uPage,re.S)
for item in myItems:
# code为电影对应代码
code1=re.findall('\d+',item,re.S)
code=code1[0]

# name为电影名称
item=item.replace(ur':',ur'')
name=re.findall(ur'">(.*?)</a>',item,re.S)

# mark为电影评分
mark1=re.findall(ur'span>\(([1-5])[\u4e00-\u9fa5]',item,re.S)
if len(mark1)>0:
mark=mark1[0]
else:
mark=[]

# date为观看日期
date1=re.findall(ur'<br>(\d{4}-\d{2}-\d{2})',item,re.S)
date=date1[0]

# comment为短评
comment1=re.findall(ur'<br>短评\s(.*?)\s*</div>',item,re.S)
if len(comment1)==0:
comment=[]
else:
comment=comment1[0].decode("utf-8")

# tag为标签
tag1=re.findall(ur'<br>标签\s(.*?)\s*<br>',item,re.S)
if len(tag1)==0:
tag=[]
else:
tag=tag1[0].decode("utf-8")
# 判断提取名称是否出现问题
if len(name)==0:
errorlist.append(code)

# 写入excel
i+=1
sheet1.write(i,0,code)
sheet1.write(i,1,name)
sheet1.write(i,2,mark)
sheet1.write(i,3,date)
sheet1.write(i,4,tag)
sheet1.write(i,5,comment)
print 'Page',page,'is OK'
time.sleep(1)

wbk.save(u"电影导出.xls")