作为一个新世纪有思想有文化有道德时刻准备着的屌丝男青年,在现在这样一个社会中,心疼我大慢播抵制大百度的前提下,没事儿上上网逛逛YY看看斗鱼翻翻美女图片那是必不可少的,可是美图虽多翻页费劲!今天我们就搞个爬虫把美图都给扒下来!本次实例有2个:煎蛋上的妹子图,某网站的rosi图。我只是一个学习python的菜鸟,技术不可耻,技术是无罪的!!!
煎蛋:
先说说程序的流程:获取煎蛋妹子图URL,得到网页代码,提取妹子图片地址,访问图片地址并将图片保存到本地。Ready? 先让我们看看煎蛋妹子网页:
我们得到URL为:http://jandan.net/ooxx/page-1764#comments 1764就是页码, 首先我们要得到最新的页码,然后向前寻找,然后得到每页中图片的url。下面我们分析网站代码写出正则表达式!
根据之前文章的方法我们写出如下函数getNewPage:
1
2
3
4
5
6
7
8
9
|
def __getNewPage( self ):
pageCode = self .Get( self .__Url)
type = sys.getfilesystemencoding()
pattern = re. compile (r '<div .*?cp-pagenavi">.*?<span .*?current-comment-page">\[(.*?)\]</span>' ,re.S)
newPage = re.search(pattern,pageCode.decode( "UTF-8" ).encode( type ))
print pageCode.decode( "UTF-8" ).encode( type )
if newPage ! = None :
return newPage.group( 1 )
return 1500
|
不要问我为什么如果失败返回1500。。。 因为煎蛋把1500页之前的图片都给吃了。 你也可以返回0。接下来是图片的
1
2
3
4
5
6
7
8
|
def __getAllPicUrl( self ,pageIndex):
realurl = self .__Url + "page-" + str (pageIndex) + "#comments"
pageCode = self .Get(realurl)
type = sys.getfilesystemencoding()
pattern = re. compile ( '<p>.*?<a .*?view_img_link">.*?</a>.*?<img src="(.*?)".*?</p>' ,re.S)
items = re.findall(pattern,pageCode.decode( "UTF-8" ).encode( type ))
for item in items:
print item
|
好了,得到了图片地址,接下来就是访问图片地址然后保存图片了:
1
2
3
4
5
6
7
|
def __savePics( self ,img_addr,folder):
for item in img_addr:
filename = item.split( '/' )[ - 1 ]
print "正在保存图片:" + filename
with open (filename, 'wb' ) as file :
img = self .Get(item)
file .write(img)
|
当你觉得信心满满的时候,一定会有一盆冷水浇到你的头上,毕竟程序就是这样,考验你的耐性,打磨你的自信。你测试了一会儿,然后你发现你重启程序后再也无法获取最新页码,你觉得我什么也没动啊为什么会这样。别着急,我们将得到的网页代码打印出来看看:
看到了吧,是服务器感觉你不像浏览器访问的结果把你的ip给屏蔽了。 真是给跪了,辛辛苦苦码一年,屏蔽回到*!那么这个如何解决呢,答:换ip 找代理。接下来我们要改一下我们的HttpClient.py 将里面的opener设置下代理服务器。具体代理服务器请自行百度之,关键字:http代理 。 想找到一个合适的代理也不容易 自己ie Internet选项挨个试试,测试下网速。
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
|
# -*- coding: utf-8 -*-
import cookielib, urllib, urllib2, socket
import zlib,StringIO
class HttpClient:
__cookie = cookielib.CookieJar()
__proxy_handler = urllib2.ProxyHandler({ "http" : '42.121.6.80:8080' }) #设置代理服务器与端口
__req = urllib2.build_opener(urllib2.HTTPCookieProcessor(__cookie),__proxy_handler) #生成opener
__req.addheaders = [
( 'Accept' , 'application/javascript, */*;q=0.8' ),
( 'User-Agent' , 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)' )
]
urllib2.install_opener(__req)
def Get( self , url, refer = None ):
try :
req = urllib2.Request(url)
#req.add_header('Accept-encoding', 'gzip')
if not (refer is None ):
req.add_header( 'Referer' , refer)
response = urllib2.urlopen(req, timeout = 120 )
html = response.read()
#gzipped = response.headers.get('Content-Encoding')
#if gzipped:
# html = zlib.decompress(html, 16+zlib.MAX_WBITS)
return html
except urllib2.HTTPError, e:
return e.read()
except socket.timeout, e:
return ''
except socket.error, e:
return ''
|
然后,就可以非常愉快的查看图片了。不过用了代理速度好慢。。。可以设置timeout稍微长一点儿,防止图片下载不下来!
好了,rosi的下篇文章再放!现在是时候上一波代码了:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
|
# -*- coding: utf-8 -*-
import cookielib, urllib, urllib2, socket
import zlib,StringIO
class HttpClient:
__cookie = cookielib.CookieJar()
__proxy_handler = urllib2.ProxyHandler({ "http" : '42.121.6.80:8080' })
__req = urllib2.build_opener(urllib2.HTTPCookieProcessor(__cookie),__proxy_handler)
__req.addheaders = [
( 'Accept' , 'application/javascript, */*;q=0.8' ),
( 'User-Agent' , 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)' )
]
urllib2.install_opener(__req)
def Get( self , url, refer = None ):
try :
req = urllib2.Request(url)
req.add_header( 'Accept-encoding' , 'gzip' )
if not (refer is None ):
req.add_header( 'Referer' , refer)
response = urllib2.urlopen(req, timeout = 120 )
html = response.read()
gzipped = response.headers.get( 'Content-Encoding' )
if gzipped:
html = zlib.decompress(html, 16 + zlib.MAX_WBITS)
return html
except urllib2.HTTPError, e:
return e.read()
except socket.timeout, e:
return ''
except socket.error, e:
return ''
def Post( self , url, data, refer = None ):
try :
#req = urllib2.Request(url, urllib.urlencode(data))
req = urllib2.Request(url,data)
if not (refer is None ):
req.add_header( 'Referer' , refer)
return urllib2.urlopen(req, timeout = 120 ).read()
except urllib2.HTTPError, e:
return e.read()
except socket.timeout, e:
return ''
except socket.error, e:
return ''
def Download( self , url, file ):
output = open ( file , 'wb' )
output.write(urllib2.urlopen(url).read())
output.close()
# def urlencode(self, data):
# return urllib.quote(data)
def getCookie( self , key):
for c in self .__cookie:
if c.name = = key:
return c.value
return ''
def setCookie( self , key, val, domain):
ck = cookielib.Cookie(version = 0 , name = key, value = val, port = None , port_specified = False , domain = domain, domain_specified = False , domain_initial_dot = False , path = '/' , path_specified = True , secure = False , expires = None , discard = True , comment = None , comment_url = None , rest = { 'HttpOnly' : None }, rfc2109 = False )
self .__cookie.set_cookie(ck)
#self.__cookie.clear() clean cookie
# vim : tabstop=2 shiftwidth=2 softtabstop=2 expandtab
HttpClient
|
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
|
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from HttpClient import HttpClient
import sys,re,os
class JianDan(HttpClient):
def __init__( self ):
self .__pageIndex = 1500 #之前的图片被煎蛋吞了
self .__Url = "http://jandan.net/ooxx/"
self .__floder = "jiandan"
def __getAllPicUrl( self ,pageIndex):
realurl = self .__Url + "page-" + str (pageIndex) + "#comments"
pageCode = self .Get(realurl)
type = sys.getfilesystemencoding()
pattern = re. compile ( '<p>.*?<a .*?view_img_link">.*?</a>.*?<img src="(.*?)".*?</p>' ,re.S)
items = re.findall(pattern,pageCode.decode( "UTF-8" ).encode( type ))
for item in items:
print item
self .__savePics(items, self .__floder)
def __savePics( self ,img_addr,folder):
for item in img_addr:
filename = item.split( '/' )[ - 1 ]
print "正在保存图片:" + filename
with open (filename, 'wb' ) as file :
img = self .Get(item)
file .write(img)
def __getNewPage( self ):
pageCode = self .Get( self .__Url)
type = sys.getfilesystemencoding()
pattern = re. compile (r '<div .*?cp-pagenavi">.*?<span .*?current-comment-page">\[(.*?)\]</span>' ,re.S)
newPage = re.search(pattern,pageCode.decode( "UTF-8" ).encode( type ))
print pageCode.decode( "UTF-8" ).encode( type )
if newPage ! = None :
return newPage.group( 1 )
return 1500
def start( self ):
isExists = os.path.exists( self .__floder) #检测是否存在目录
print isExists
if not isExists:
os.mkdir( self .__floder)
os.chdir( self .__floder)
page = int ( self .__getNewPage())
for i in range ( self .__pageIndex,page):
self .__getAllPicUrl(i)
if __name__ = = '__main__' :
jd = JianDan()
jd.start()
JianDan
|