python 爬虫一键爬取淘宝天猫宝贝页面主图颜色图和详情图的教程

实例如下所示：

100

101

102

103

104

105

106

107

108

109

110

111

112

113

114

115

116

117

118

119

120

121

122

123

124

125

126

127

128

129

									import requests

									import re,sys,os

									import json

									import threading

									import pprint

									class spider:

									 def __init__(self,sid,name):

									 self.id = sid

									 self.headers = { "Accept":"text/html,application/xhtml+xml,application/xml;",

									  "Accept-Encoding":"gzip",

									  "Accept-Language":"zh-CN,zh;q=0.8",

									  "Referer":"http://www.example.com/",

									  "User-Agent":"Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/42.0.2311.90 Safari/537.36"

									  }

									 self.name=name

									 def openurl(self,url):

									 self.request = requests.get(url,headers = self.headers) 

									 if self.request.ok:

									  return self.request.text

									 def matchs(self):

									 tmall_exp = r"Setup\(([\s\S]+?)\);"### 匹配商品数据的正则

									 detail= r"src=\"(https://img\S+?[jpgifn]+?)\"" ###匹配 商品详情图的正则

									 html = self.openurl("https://detail.tmall.com/item.htm?id=%s"%self.id)

									 data = re.findall(tmall_exp,html)

									 data = json.loads(data[0])

									 main_img = data['propertyPics'] ## 这里包括了主图和颜色图的地址

									 color_data =data['valItemInfo'] ['skuList'] ### 这里获得商品的颜色信息列表 包括颜色编码 颜色名称,商品skuID

									 detail_html = self.openurl("http:"+data['api']["httpsDescUrl"])

									 detail_image = re.findall(detail,detail_html)

									 self.newdata={"MAIN":main_img['default'],"DETAIL":detail_image,"id":self.id,}

									 psvs = []

									 self.newdata['COLOR']=[]

									 for v in range(len(color_data)):

									  if ";"in color_data[v]["pvs"]:

									  psv = color_data[v]['pvs'][color_data[v]['pvs'].find(";")+1:]

									  else:

									  psv = color_data[v]['pvs']

									  if psv in psvs:

									  continue

									  psvs.append(psv)

									  self.newdata['COLOR'].append({color_data[v]["names"]:main_img[";"+psv+";"]})

									 pprint.pprint(self.newdata)

									 return self.newdata

									 def download(self):

									 if len(self.newdata)>0:

									  for x in range(len(self.newdata['MAIN'])):

									  threading.Thread(target=self.download_main,args=(self.newdata['MAIN'][x],x)).start()

									  for x in self.newdata['COLOR']:

									  threading.Thread(target=self.download_color,args=(x,)).start()

									  for x in range(len(self.newdata['DETAIL'])):

									  threading.Thread(target=self.download_detail,args=(self.newdata['DETAIL'][x],x)).start()

									 return

									 def download_main(self,url,index):

									 try:

									  img = requests.get("http:"+url,stream=True,headers = self.headers,timeout=10)

									 except:

									  print(sys.exc_info())

									  return

									 if img.ok:

									  if not os.path.exists(self.name+"/main"):

									  try:

									   os.makedirs(self.name+"/main")

									  except:

									   pass

									  imgs = open(self.name+"/main/%s.jpg"%index,"wb")

									  imgs.write(img.content)

									  imgs.close()

									 def download_color(self,url):

									 try:

									  img = requests.get("http:"+url[list(url.keys())[0]][0],stream=True,headers = self.headers,timeout=10)

									 except:

									  print(sys.exc_info())

									  return

									 if img.ok:

									  if not os.path.exists(self.name+"/color"):

									  try:

									   os.makedirs(self.name+"/color")

									  except:

									   pass

									  if "/"in list(url.keys())[0]:

									  color = list(url.keys())[0].replace("/","_")

									  elif "\\" in list(url.keys())[0]:

									  color = list(url.keys())[0].replace("\\","_")

									  else:

									  color = list(url.keys())[0]

									  imgs = open(self.name+"/color/%s.jpg"%color,"wb")

									  imgs.write(img.content)

									  imgs.close()

									 def download_detail(self,url,index):

									 try:

									  img = requests.get(url,stream=True,headers = self.headers,timeout=10)

									 except:

									  print(sys.exc_info())

									  return

									 if img.ok:

									  if not os.path.exists(self.name+"/detail"):

									  try:

									   os.makedirs(self.name+"/detail")

									  except:

									   pass

									  imgs = open(self.name+"/detail/%s.jpg"%index,"wb")

									  imgs.write(img.content)

									  imgs.close()

									if __name__ =="__main__":

									 sid = 528766269341 ## 这里输入天猫宝贝ID

									 taobao = spider(sid,"下载图片/T")

									 taobao.matchs()

									 taobao.download()

以上这篇python 爬虫一键爬取淘宝天猫宝贝页面主图颜色图和详情图的教程就是小编分享给大家的全部内容了，希望能给大家一个参考，也希望大家多多支持服务器之家。

原文链接：https://blog.csdn.net/mzbqhbc12/article/details/54992954

秒客网

python 爬虫一键爬取淘宝天猫宝贝页面主图颜色图和详情图的教程

相关文章

python 爬虫一键爬取 淘宝天猫宝贝页面主图颜色图和详情图的教程

相关文章

python 爬虫一键爬取淘宝天猫宝贝页面主图颜色图和详情图的教程