Python网络爬虫系列(四) --- 批量抓取并保存图片

时间:2022-11-13 20:39:27
# -*- coding:utf-8 -*-
import re
import random
import socket
import urllib2
import cookielib
import urllib
import thread
import time
import string
import os

import sys
# Python 2 only: set the process-wide default string encoding to UTF-8 so that
# implicit str <-> unicode conversions of the Chinese text handled below do not
# fall back to ASCII. reload(sys) is called first, presumably because
# sys.setdefaultencoding is removed from the module at interpreter start-up
# (see the Python 2 `site` documentation) -- confirm before porting.
reload(sys)
sys.setdefaultencoding('utf8')



class Spider:
    """Crawl 58.com used-car listings city by city and save one photo per car.

    The crawl (see getContents) scrapes the list of city codes, walks the
    paginated listing pages of every city, opens each listed car page, and
    stores that page's main image on disk under ``file_path``.

    Methods that fetch or parse return the integer ``0`` to signal "nothing
    found / request failed"; callers compare against ``0`` explicitly.
    """

    # Directory the downloaded images are written into.
    file_path = "D://pachong//"
    # Page whose onclick="co('<code>')" handlers carry the city codes.
    city_url = 'http://www.58.com/ershouche/changecity/'
    # Intended pause (seconds) before each download. Currently unused: the
    # corresponding sleep in getPage is disabled.
    sleep_download_time = 10
    # A city is abandoned after this many consecutive listing pages that either
    # failed to download or contained no listings, so the crawl cannot spin
    # forever on an unreachable city or a changed page layout.
    max_page_misses = 3

    # Captures the parenthesised city code, e.g. "('nb')" from onclick="co('nb')">.
    _CITY_RE = re.compile(r'onclick="co(.*?)">')
    # Captures the (possibly quoted) src value of the main photo on a car page.
    _IMAGE_RE = re.compile(r'<img id="bigimg1" src=(.*?) onload')
    # Captures the car name from the page title, up to the model-year marker.
    _NAME_RE = re.compile(r'<title>【图】 (.*?)款')
    # Captures the detail-page URL of every car on a listing page.
    _LISTING_RE = re.compile(r'<a href="(.*?)" target="_blank" class="t"', re.M | re.I)
    # Notice shown by the site once a city has no further listings.
    _END_RE = re.compile(r'以上本地信息更新较少,其他城市不妨考虑一下', re.M | re.I)

    def __init__(self):
        # URL template: createUrl substitutes 'city' and 'page'.
        self.siteURL = 'http://city.58.com/bentian/page/'
        # Applies to every socket created afterwards in this process.
        socket.setdefaulttimeout(20)

    def getAllcity(self):
        """Return the list of raw city tokens (e.g. "('nb')"), or 0 on failure.

        Each token is printed as it is found. An empty list is returned when
        the page downloads but contains no city handlers.
        """
        page = self.getPage(self.city_url)
        if page == 0:
            return 0
        items = self._CITY_RE.findall(page)
        for item in items:
            print(item)
        return items

    def getPage(self, _url):
        """Download ``_url`` and return its body, or 0 if the request fails.

        Failures are reported on stdout and never raised for the exception
        types handled here (HTTP errors, URL errors, socket/IO errors).
        """
        # NOTE: throttling via time.sleep(self.sleep_download_time) is disabled.
        try:
            response = urllib2.urlopen(_url)
            try:
                # Reading can time out as well, so it stays inside the try.
                return response.read()
            finally:
                response.close()
        except urllib2.HTTPError as e:
            # Must precede URLError: HTTPError is a subclass of it.
            print('%sakuarius2' % (getattr(e, 'reason', e),))
            return 0
        except urllib2.URLError as e:
            # e.reason may be an exception object rather than a string, so it
            # is formatted instead of concatenated.
            print('%sakuarius1' % (e.reason,))
            return 0
        except IOError as e:
            # Plain IOError / socket errors carry no .reason attribute.
            print('%sakuarius3' % (e,))
            return 0

    def saveImg(self, imageURL, fileName):
        """Download ``imageURL`` and write it to ``file_path + fileName + '.bmp'``.

        The output directory is created when missing. I/O failures are
        reported on stdout and the image is skipped, so one bad image does
        not abort the crawl. Returns None.
        """
        print(imageURL)
        fileName = self.file_path + fileName + '.' + 'bmp'
        print(fileName)
        try:
            if not os.path.isdir(self.file_path):
                os.makedirs(self.file_path)
            u = urllib.urlopen(imageURL)
            try:
                data = u.read()
            finally:
                u.close()
            with open(fileName, 'wb') as f:
                f.write(data)
        except (IOError, OSError) as e:
            print('%sakuarius4' % (e,))
            return
        print(u"正在悄悄保存她的一张图片为 %s" % fileName)

    def getImage(self, page):
        """Return the main image URL found in ``page`` (single quotes removed), or 0."""
        items = self._IMAGE_RE.findall(page)
        if items:
            return items[0].replace('\'', '')
        return 0

    def getCarNmae(self, page):
        """Return the car name from the page title as unicode without spaces, or 0.

        ``page`` is expected to be a UTF-8 encoded byte string (Python 2 str),
        as returned by getPage.
        """
        items = self._NAME_RE.findall(page)
        if items:
            return items[0].decode('utf-8').replace(' ', '')
        return 0

    # Correctly spelled alias; the original (misspelled) name is kept for callers.
    getCarName = getCarNmae

    def createUrl(self, page_index, city):
        """Build the listing URL for ``city`` (raw token or bare code) and page number."""
        url = self.siteURL.replace('page', 'pn' + str(page_index))
        # Strip the "('" ... "')" wrapper left by the city regex capture.
        city = city.replace("('", "").replace("')", "")
        return url.replace('city', city)

    def getContents(self):
        """Run the full crawl over every city.

        Returns 0 when the city list cannot be fetched, otherwise None.
        """
        citys = self.getAllcity()
        if citys == 0:
            return 0
        count = 0
        for city in citys:
            count = self._crawlCity(city, count)

    def _crawlCity(self, city, count):
        """Walk the listing pages of one city; return the updated image counter."""
        misses = 0
        index = 1  # the first listing page is pn1
        while True:
            current_url = self.createUrl(index, city)
            index += 1
            print(current_url)
            page = self.getPage(current_url)
            if page != 0:
                print("haha")
                if self._END_RE.search(page):
                    print("该地区该种类搜索完毕")
                    break
                items = self._LISTING_RE.findall(page)
                if items:
                    misses = 0
                    for item in items:
                        count = self._saveCar(item, count)
                    continue
            # Reached for a failed download or a page without listings.
            misses += 1
            if misses >= self.max_page_misses:
                print('too many consecutive failed or empty pages, skipping city')
                break
        return count

    def _saveCar(self, car_url, count):
        """Fetch one car page and save its image; return the updated counter."""
        print(car_url)
        car_page = self.getPage(car_url)
        if car_page == 0:
            # Parsing a failed download would raise a TypeError in re.
            return count
        image_url = self.getImage(car_page)
        if image_url == 0:
            return count
        image_name = self.getCarNmae(car_page)
        if image_name == 0:
            return count
        count = count + 1
        # The running counter keeps file names unique across identical titles.
        self.saveImg(image_url, image_name + '___' + str(count))
        return count



# Script entry point: build the crawler and start the full crawl immediately.
# NOTE(review): this runs at import time as well, since there is no
# `if __name__ == "__main__":` guard.
spider = Spider()
spider.getContents()