仅作学习交流之用。
对于动态网页,可以先分析其请求数据,再模拟 POST 请求,只获取目标数据。这种方式占用资源较少,在网络带宽有限的情况下比使用 webbrowser 加载完整页面更高效。
网站:LOFTER ART(http://www.lofter.com/art/print)
下载该网站里的图片。
其中图片列表是动态加载的,从中获得图片的ID,请求对应的网址,下载图片。
主模块:
#-*- coding:utf-8 -*-
# lofterart crawler
# author: windroid
# 15/3/5
"""Walk the product list pages and download the pictures of every product.

For each page number from SPAGE to MAXPAGE (inclusive) the script asks
getPage.getPage() for the product entries on that page, hands each product id
to downPic.downPic(), and then records the finished page number in 'set.ini'.
"""
import getPage
import downPic
import re
import time

MAXPAGE = 184  # last list page to fetch (inclusive)
SPAGE = 19  # first list page to fetch
# Target directory. downPic builds file names by plain concatenation, so the
# trailing separator is required.
PATH = 'D:\\lofter\\'

# getPage returns entries shaped like 'productId=23123720'; only the part
# after this prefix is the id that downPic expects.
ID_PREFIX_LEN = len('productId=')

print(PATH)
print(SPAGE)
print('downloading...')
for downpage in range(SPAGE, MAXPAGE + 1):
    pagelist = getPage.getPage(downpage)
    for v in pagelist:
        # Optional throttle left disabled by the author (original note: "404"):
        # time.sleep(1)
        downPic.downPic(v[ID_PREFIX_LEN:], PATH)
    # Record the last completed page. Nothing in this module reads the file
    # back; to resume, SPAGE has to be edited by hand.
    with open('set.ini', 'w') as progress:
        progress.write(str(downpage))
    print('download page: ' + str(downpage) + ' over.')
print('download is over.')
getPage
获得图片列表
#-*- coding: utf-8 -*-
"""Fetch one page of the dynamically loaded product list via its DWR endpoint."""
import urllib2
import gzip
import StringIO
import re


def getPage(page):
    """Load one list page and return the product entries found in it.

    Args:
        page: integer page number.

    Returns:
        A list of strings shaped like 'productId=23123720'. The list is empty
        when the request fails or the response contains no product ids.
    """
    # NOTE(review): hand-derived formula; its meaning is not evident from this
    # file -- confirm against the site's own requests before changing it.
    BATCHID = 196800 - page * 13
    # Earlier experiment, kept for reference: page 1 was sent as page 0 with a
    # value of 32, pages up to 50 used 16 and later pages used 8.
    PARAM2 = 8
    # c0-param0 selects the product kind: 1 = framed print, 2 = postcard.
    # c0-param2 / c0-param3 are presumably page size and offset -- TODO confirm.
    # The request body carries one key=value pair per line.
    postdata = '\n'.join([
        'callCount=1',
        'scriptSessionId=${scriptSessionId}187',
        'httpSessionId=',
        'c0-scriptName=SaleBean',
        'c0-methodName=getSaleRecommendItemList',
        'c0-id=0',
        'c0-param0=number:2',
        'c0-param1=number:-1',
        'c0-param2=number:%d',
        'c0-param3=number:%d',
        'batchId=%d',
    ]) % (PARAM2, PARAM2 * page, BATCHID)
    url = 'http://www.lofter.com/dwr/call/plaincall/SaleBean.getSaleRecommendItemList.dwr'
    myheaders = {
        'Host': 'www.lofter.com',
        'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0',
        # Only gzip is advertised because it is the only encoding decoded below.
        'Accept-Encoding': 'gzip',
        'Content-Type': 'text/plain; charset=UTF-8',
        'Referer': 'http://www.lofter.com/art/print',
    }
    opener = urllib2.build_opener()
    req = urllib2.Request(url, data=postdata, headers=myheaders)
    try:
        f = opener.open(req)
        try:
            encoding = f.info().get('Content-Encoding', '')
            resdata = f.read()
        finally:
            f.close()
        # Decompress only when the server says the body is gzip-compressed.
        if encoding.lower() == 'gzip':
            resdata = gzip.GzipFile(fileobj=StringIO.StringIO(resdata)).read()
    except Exception as err:
        # Exception (not BaseException) so Ctrl-C still stops the crawl.
        print('getPage: ' + str(page) + ' failed.' + 'Error: ' + str(err))
        return []
    # Require at least one digit so callers never receive an empty id.
    return re.findall(r'productId=\d+', resdata)

# Manual check: getPage(1)  (original note: 184)
downPic
下载图片
# -*- coding: utf-8 -*-
"""Download every picture shown on a single product page."""
import urllib
import urllib2
import cookielib
import re

headers = {'User-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:35.0) Gecko/20100101 Firefox/35.0'}
# One shared opener so cookies set by the site are kept across product pages.
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookielib.CookieJar()))
# Picture tags on a product page carry the "card showimgtag" class.
_IMG_PATTERN = re.compile('<img src="(.*?)" class="card showimgtag">')


def downPic(picid, path):
    """Fetch the product page for picid and save each picture found on it.

    Files are written as path + picid + '-' + N + '.jpg' with N counting from
    1, so path must already end with a directory separator.

    Args:
        picid: product id as a string (digits only, without 'productId=').
        path: directory prefix the files are written under.

    Returns:
        0 when the page and all of its pictures were fetched, 1 when the page
        or at least one picture could not be downloaded.
    """
    req = urllib2.Request('http://www.lofter.com/art/product-' + picid, headers=headers)
    try:
        content = opener.open(req)
        try:
            html = content.read()
        finally:
            content.close()
    except Exception as err:
        # Exception (not BaseException) so Ctrl-C still stops the crawl.
        print('download: ' + picid + '.jpg failed. Error: ' + str(err))
        return 1

    status = 0
    for x, item in enumerate(_IMG_PATTERN.findall(html), 1):
        name = picid + '-' + str(x) + '.jpg'
        try:
            urllib.urlretrieve(item, path + name)
        except Exception as err:
            # A single broken picture must not abort the rest of the crawl.
            print('download: ' + name + ' failed. Error: ' + str(err))
            status = 1
            continue
        print('download: ' + name + ' over.')
    return status