Python爬虫爬取淘宝商品信息(selenium+phantomjs)

时间:2022-05-03 09:38:57

本文实例为大家分享了python爬虫爬取淘宝商品的具体代码,供大家参考,具体内容如下

1、需求目标

进入淘宝页面,搜索关键词“耐克”,抓取商品的标题、链接、价格、城市、旺旺号、付款人数;然后进入第二层页面(商品详情页),抓取商品的销售量、款号等。

Python爬虫爬取淘宝商品信息(selenium+phantomjs)

Python爬虫爬取淘宝商品信息(selenium+phantomjs)

Python爬虫爬取淘宝商品信息(selenium+phantomjs)

2、结果展示

Python爬虫爬取淘宝商品信息(selenium+phantomjs)

3、源代码

?
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
# encoding: utf-8
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import time
import pandas as pd
time1=time.time()
from lxml import etree
from selenium import webdriver
#########自动模拟
driver=webdriver.phantomjs(executable_path='d:/python27/scripts/phantomjs.exe')
import re
 
#################定义列表存储#############
title=[]
price=[]
city=[]
shop_name=[]
num=[]
link=[]
sale=[]
number=[]
 
#####输入关键词耐克(这里必须用unicode)
keyword="%e8%80%90%e5%85%8b"
 
 
for i in range(0,1):
 
  try:
    print "...............正在抓取第"+str(i)+"页..........................."
 
    url="https://s.taobao.com/search?q=%e8%80%90%e5%85%8b&imgfile=&js=1&stats_click=search_radio_all%3a1&initiative_id=staobaoz_20170710&ie=utf8&bcoffset=4&ntoffset=4&p4ppushleft=1%2c48&s="+str(i*44)
    driver.get(url)
    time.sleep(5)
    html=driver.page_source
 
    selector=etree.html(html)
    title1=selector.xpath('//div[@class="row row-2 title"]/a')
    for each in title1:
      print each.xpath('string(.)').strip()
      title.append(each.xpath('string(.)').strip())
 
 
    price1=selector.xpath('//div[@class="price g_price g_price-highlight"]/strong/text()')
    for each in price1:
      print each
      price.append(each)
 
 
    city1=selector.xpath('//div[@class="location"]/text()')
    for each in city1:
      print each
      city.append(each)
 
 
    num1=selector.xpath('//div[@class="deal-cnt"]/text()')
    for each in num1:
      print each
      num.append(each)
 
 
    shop_name1=selector.xpath('//div[@class="shop"]/a/span[2]/text()')
    for each in shop_name1:
      print each
      shop_name.append(each)
 
 
    link1=selector.xpath('//div[@class="row row-2 title"]/a/@href')
    for each in link1:
      kk="https://" + each
 
 
      link.append("https://" + each)
      if "https" in each:
        print each
 
        driver.get(each)
      else:
        print "https://" + each
        driver.get("https://" + each)
      time.sleep(3)
      html2=driver.page_source
      selector2=etree.html(html2)
 
      sale1=selector2.xpath('//*[@id="j_detailmeta"]/div[1]/div[1]/div/ul/li[1]/div/span[2]/text()')
      for each in sale1:
        print each
        sale.append(each)
 
      sale2=selector2.xpath('//strong[@id="j_sellcounter"]/text()')
      for each in sale2:
        print each
        sale.append(each)
 
      if "tmall" in kk:
        number1 = re.findall('<ul id="j_attrul">(.*?)</ul>', html2, re.s)
        for each in number1:
          m = re.findall('>*号: (.*?)</li>', str(each).strip(), re.s)
          if len(m) > 0:
            for each1 in m:
              print each1
              number.append(each1)
 
          else:
            number.append("null")
 
      if "taobao" in kk:
        number2=re.findall('<ul class="attributes-list">(.*?)</ul>',html2,re.s)
        for each in number2:
          h=re.findall('>*号: (.*?)</li>', str(each).strip(), re.s)
          if len(m) > 0:
            for each2 in h:
              print each2
              number.append(each2)
 
          else:
            number.append("null")
 
      if "click" in kk:
        number.append("null")
 
  except:
    pass
 
 
print len(title),len(city),len(price),len(num),len(shop_name),len(link),len(sale),len(number)
 
# #
# ######数据框
data1=pd.dataframe({"标题":title,"价格":price,"旺旺":shop_name,"城市":city,"付款人数":num,"链接":link,"销量":sale,"款号":number})
print data1
# 写出excel
writer = pd.excelwriter(r'c:\\taobao_spider2.xlsx', engine='xlsxwriter', options={'strings_to_urls': false})
data1.to_excel(writer, index=false)
writer.close()
 
time2 = time.time()
print u'ok,爬虫结束!'
print u'总共耗时:' + str(time2 - time1) + 's'
####关闭浏览器
driver.close()

以上就是本文的全部内容,希望对大家的学习有所帮助,也希望大家多多支持服务器之家。

原文链接:http://blog.csdn.net/u013421629/article/details/74960278