python爬虫之淘宝宝贝图片抓取

时间:2024-02-15 20:39:39

  写在前面的话:家里有人开淘宝店,作为一个小的淘宝店主,经常要做的就是从别人的店铺(当然是批发商)把图片一张一张存下来,然后再自己用PS修得好看一点,再上架。这样存图什么的挺烦人的,刚好最近在学习python,发现用它来做这件事,真心的那叫一个方便。

  总的来说,其实也并没有什么技术含量,只是熟悉一下python的语言和正则表达式的使用。

  主要步骤 :

  1、当然是抓取页面html代码 

 1 import urllib
 2 import urllib2
 3 
 4 # Fetch the raw HTML of url; prints the error reason and returns None on URLError.
 5 def getHtml(url):
 6     request = urllib2.Request(url , headers = headers)  # uses the module-level headers (UA) dict
 7     try:
 8         response = urllib2.urlopen(request)
 9         html = response.read()
10         return html
11     except urllib2.URLError,e:
12         print e.reason

 

  2、分析页面中的详情图片部分和主图部分

   淘宝的html页面相当整齐,可读性不错,很快就可以找到描述页的位置:descUrl .. location.protocol === 'http:' ? '//...' : '//...'

可以写一个正则表达式,提取出来 

1 import re
2 
3 # Extract the protocol-relative description-page URL(s) embedded in the item page's JS.
4 def descUrl(html):
5     reg = r"descUrl.*?location.protocol===\'http:\' \? \'//(.*?)\'.?:"
6     desurlre = re.compile(reg,re.I)
7     desurl = re.findall(desurlre , html)
8     return desurl

    再获取这个详情页地址,就可以提取出所有的图片地址了。

1 # Collect every src="..." attribute value from the HTML (case-insensitive).
2 def getImglist(html):
3     reg = r\'src=\"(.*?)\"\'
4     imgre = re.compile(reg,re.I)
5     imglist = re.findall(imgre , html)
6     return imglist

 

  3、下载图片

      获取到了图片的url后,当然就是把图片下下来,这里做一个指定路径的保存方法。

因此再加一个创建路径

# Ensure `path` exists as a directory, creating intermediate directories as needed.
# If `path` already exists as a regular file, a directory cannot be created there,
# so raise OSError explicitly (the original fell through to os.mkdir on the
# existing file, which always raised OSError anyway — now the message is clear).
def createDir(path):
    if os.path.isfile(path):
        raise OSError("cannot create directory, a file is in the way: " + path)
    if not os.path.exists(path):
        os.makedirs(path)

  保存图片

 1 #保存所有图片
 2 def saveImgTo(imglist , path):
 3     createDir(path)
 4     imgIndex = 1
 5     for imgurl in imglist:
 6         splist = imgurl.split(\'.\')
 7         filetype = splist[len(splist)-1]
 8         print "saving " + imgurl
 9         try:
10             urllib.urlretrieve(imgurl , path + "/"+ str(imgIndex) + \'.\' + filetype )
11             imgIndex += 1
12             print "==> ok!"
13         except:
14             print "==> err!!!!!!"

 

  以下为一份完整代码,传入存储路径,保存下url.txt 中所有url的淘宝或其他网页图片。新手上路,写的不好的地方轻拍:

  1 #coding=utf-8
  2 
  3 import re
  4 import urllib
  5 import urllib2
  6 import cookielib
  7 import StringIO, gzip
  8 import os
  9 import sys
 10 
 11 headers = {
 12         \'User-Agent\': \'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/31.0.1650.63 Safari/537.36\'
 13 }
 14 
 15 
 16 # Decompress a gzip-encoded response body (Python 2 StringIO + gzip).
 17 def gzdecode(data) :  
 18     compressedstream = StringIO.StringIO(data)  
 19     gziper = gzip.GzipFile(fileobj=compressedstream)    
 20     data2 = gziper.read()   # read the decompressed payload
 21     return data2 
 22     
 23 # Fetch the raw HTML of url; prints the error reason and returns None on URLError.
 24 def getHtml(url):
 25     request = urllib2.Request(url , headers = headers)  # module-level headers (UA) dict
 26     try:
 27         response = urllib2.urlopen(request)
 28         html = response.read()
 29         return html
 30     except urllib2.URLError,e:
 31         print e.reason
 32 
 33 #目录是否存在,不存在则创建
 34 def createDir(path):
 35     if not os.path.exists(path):
 36         os.makedirs(path)
 37     else:
 38         if os.path.isfile(path):
 39             os.mkdir(path)
 40 
 41 #提取描述url
 42 def descUrl(html):
 43     reg = r"descUrl.*?location.protocol===\'http:\' \? \'//(.*?)\'.?:"
 44     desurlre = re.compile(reg,re.I)
 45     desurl = re.findall(desurlre , html)
 46     return desurl
 47 
 48 #提取所有图片
 49 def getImglist(html):
 50     reg = r\'src=\"(.*?)\"\'
 51     imgre = re.compile(reg,re.I)
 52     imglist = re.findall(imgre , html)
 53     return imglist
 54 # Download the main ("title") images from the page's auctionImages JS array into path.
 55 def getTitleImg(html, path):
 56     createDir(path)
 57     reg = r\'auctionImages.*?\[(.*?)\]\'
 58     imgre = re.compile(reg,re.I)
 59     titleImg = re.findall(imgre , html)
 60     titleImg = titleImg[0]  # NOTE(review): IndexError if the page has no auctionImages block
 61     imglist = titleImg.split(\',\')
 62     titleIndex = 1
 63     for imgurl in imglist:
 64         print "img ==== >  " + imgurl
 65         imgurl = imgurl.strip(\'"\')
 66         imgurl = \'http:\' + imgurl  # strip quotes, then make the protocol-relative URL absolute
 67         print imgurl
 68         splist = imgurl.split(\'.\')
 69         filetype = splist[len(splist)-1]  # extension = text after the last dot
 70         try:
 71                 urllib.urlretrieve(imgurl , path + "/title"+ str(titleIndex) + \'.\' + filetype )
 72                 titleIndex += 1
 73                 print "==> ok!"
 74         except:  # best-effort: report and continue with the next image
 75                print "==> err!!!!!!"
 76 
 77 # Save every image URL in imglist under path, numbered 1.<ext>, 2.<ext>, ...
 78 def saveImgTo(imglist , path):
 79     createDir(path)
 80     imgIndex = 1
 81     for imgurl in imglist:
 82         splist = imgurl.split(\'.\')
 83         filetype = splist[len(splist)-1]  # extension = text after the last dot
 84         print "saving " + imgurl
 85         try:
 86             urllib.urlretrieve(imgurl , path + "/"+ str(imgIndex) + \'.\' + filetype )
 87             imgIndex += 1
 88             print "==> ok!"
 89         except:  # best-effort: report the failure and move on
 90             print "==> err!!!!!!"
 91 
 92 # From a Taobao item page: save the main images, then fetch the detail page and save its images.
 93 def getTaoBaoImg(url ,savePath):
 94     html = getHtml(url)
 95     getTitleImg(html , savePath)
 96     desurl = descUrl(html)
 97     desurl = "http://" + desurl[0]  # NOTE(review): IndexError if no descUrl match was found
 98     print "desurl = " +  desurl
 99     print "----------------------------------------------------------"
100     # fetch the Taobao detail-description html
101     desHtml = getHtml(desurl)
102     imglist = getImglist(desHtml)
103     saveImgTo(imglist , savePath)
104 #-------------------------------------我是华丽的分界线 begin Other-----------------------------------------
# Return every src="..." value in `html`. Compiled with DOTALL (re.S) so the
# lazy '.*?' can cross newlines; matching is case-sensitive (no re.I here).
def getOtherImgurllist(html):
    src_pattern = re.compile(r'src="(.*?)"', re.S)
    matches = src_pattern.findall(html)
    return matches
111     
112 
# Fetch a non-Taobao page and save every image referenced by src="..." into savePath.
def getOtherImg(url , savePath):
    page = getHtml(url)
    found_imgs = getOtherImgurllist(page)
    saveImgTo(found_imgs , savePath)
118 
119 # Extract main images for non-Taobao pages — not implemented yet.
120 def getOthertitleImg(html, savePath):
121     print "todo:"
122 
123 #-------------------------------------我是华丽的分界线 end Other-----------------------------------------
124     
# Save the original page URL into savePath/url.htm as a tiny meta-refresh page,
# so the source of each downloaded image folder can be revisited later.
def saveUrl(url , savePath):
    # the page redirects to `url` after 0.1s (gb2312 charset, as the original wrote)
    page = ("""<html>
<head>
<meta http-equiv="Content-Language" content="zh-CN">
<meta HTTP-EQUIV="Content-Type" CONTENT="text/html; charset=gb2312">
<meta http-equiv="refresh" content="0.1;url=""" + url + """\">
<title></title>
</head>
<body>
</body>
</html>""")
    # `with` guarantees the file handle is closed even if the write raises
    # (the original's open/close pair leaked the handle on error)
    with open( savePath + "/url.htm" , "w") as output:
        output.write(page)
139 
140     
# ---- Script entry: read URLs from url.txt, save each page's images under savepath/<n>/ ----
141 savepath = "img"
142 
143 input = open(\'url.txt\', \'r\')  # NOTE(review): shadows the builtin input() and is never closed
144 
145 urls = input.read( )
146 urls = urls.split(\'\r\n\')  # NOTE(review): assumes CRLF line endings — an LF-only file yields one big string
147 print urls
148 
149 if len(sys.argv)>1 and sys.argv[1]:
150     savepath = sys.argv[1]
151 
152 print savepath
153 
154 urlIndex = 1
155 for url in urls:
156     if len(url) < 10:  # skip empty/too-short entries left by the split
157         continue
158     urlSavePath = savepath + \'/\' + str(urlIndex)
159     createDir(urlSavePath)
160     saveUrl(url , urlSavePath)
161     print \'*\'*50
162     print url
163     if url.find(\'taobao\') != -1:  # Taobao URLs get the special descUrl/auctionImages handling
164         getTaoBaoImg(url , urlSavePath)
165     else:
166         getOtherImg(url , urlSavePath)
167     urlIndex += 1
168 
169 print "success!"