京东商品详情页的请求处理流程是:先返回并显示 HTML 页面,然后再通过 ajax 请求获取并显示价格。因此获取价格有两种方法:
1. 运行 js,然后解析渲染之后得到的 html
2. 模拟 js 发出的 ajax 请求,直接得到价格
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
|
# -*- coding: utf-8 -*-
"""
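Fetch the price of a JD.com product from its product-page URL.
JD serves the HTML page first and then obtains the product price with a separate ajax GET request.
1. The product data is embedded in the HTML in the following format (example):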
# product: {
# skuid: 1310118868,
# name: '\u9999\u5f71\u77ed\u88d9\u4e24\u4ef6\u5957\u88c5\u5973\u0032\u0030\u0031\u0034\u51ac\u88c5\u65b0\u6b3e\u97e9\u7248\u957f\u8896\u0054\u6064\u4e0a\u8863\u8377\u53f6\u8fb9\u534a\u8eab\u88d9\u6f6e\u0020\u85cf\u9752\u0020\u004d',
# skuidkey:'7781F505B71CE37A3AFBADA119D3587F',
# href: 'http://item.jd.com/1310118868.html',
# src: 'jfs/t385/197/414081450/336886/3070537b/541be890N2995990c.jpg',
# cat: [1315,1343,1355],
# brand: 18247,
# nBrand: 18247,
# tips: false,
# type: 2,
# venderId:38824,
# shopId:'36786',
# TJ:'0',
# specialAttrs:["is7ToReturn-1"],
# videoPath:'',
# HM:'0'
# }
2. The ajax request is issued by the following code:
# // 获得数字价格
# var getPriceNum = function(skus, $wrap, perfix, callback) {
# skus = typeof skus === 'string' ? [skus]: skus;
# $wrap = $wrap || $('body');
# perfix = perfix || 'J-p-';
# $.ajax({
# url: 'http://p.3.cn/prices/mgets?skuIds=J_' + skus.join(',J_') + '&type=1',
# dataType: 'jsonp',
# success: function (r) {
# if (!r && !r.length) {
# return false;
# }
# for (var i = 0; i < r.length; i++) {
# var sku = r[i].id.replace('J_', '');
# var price = parseFloat(r[i].p, 10);
#
# if (price > 0) {
# $wrap.find('.'+ perfix + sku).html('¥' + r[i].p + '');
# } else {
# $wrap.find('.'+ perfix + sku).html('暂无报价');
# }
#
# if ( typeof callback === 'function' ) {
# callback(sku, price, r);
# }
# }
# }
# });
# };
"""
import urllib
import json
import re
class JdPrice(object):
    """Look up the price of a JD.com product from its product-page URL.

    The page HTML is downloaded once, at construction time. The price is
    requested separately from the ``p.3.cn`` price endpoint, using the
    ``skuid`` value found in the page.

    Attributes:
        url: The product-page URL passed to the constructor.
        html: The page body exactly as returned by ``read()``.
    """

    # Text between ``compatible: true,`` and the next ``};`` in the page.
    _PRODUCT_RE = re.compile(r'compatible: true,(.*?)};', re.S)
    # The ``skuid: <value>,`` entry inside that text.
    _SKUID_RE = re.compile(r'skuid: (.*?),')
    _PRICE_URL = 'http://p.3.cn/prices/mgets?skuIds=J_%s&type=1'

    def __init__(self, url):
        """Download the page at *url* and keep its body in ``self.html``.

        Any exception raised while opening or reading the URL propagates.
        """
        self.url = url
        self._response = self._open(self.url)
        try:
            self.html = self._response.read()
        finally:
            # Release the connection as soon as the body has been read.
            self._response.close()

    @staticmethod
    def _open(url):
        """Open *url* with the ``urlopen`` of the running interpreter."""
        try:
            from urllib.request import urlopen  # Python 3
        except ImportError:
            from urllib import urlopen  # Python 2
        return urlopen(url)

    @staticmethod
    def _as_text(data):
        """Return *data* as ``str``, decoding it when it is Python 3 bytes."""
        if isinstance(data, bytes) and not isinstance(data, str):
            # Only ASCII markers are searched for, so a lossy decode is safe.
            # NOTE(review): gb18030 is an assumed page encoding - confirm.
            return data.decode('gb18030', 'replace')
        return data

    @staticmethod
    def _parse_price(body):
        """Return the ``p`` field of the first entry in a price response.

        Args:
            body: JSON text (``str`` or UTF-8 ``bytes``) holding a list of
                objects.

        Returns:
            The ``p`` value of the first entry, or ``None`` when the list is
            empty or the value is missing or falsy.
        """
        if isinstance(body, bytes):
            body = body.decode('utf-8')
        entries = json.loads(body)
        if not entries:
            return None
        return entries[0].get('p') or None

    def get_product(self):
        """Return the product description embedded in the page, unparsed.

        The result is the text between ``compatible: true,`` and the next
        ``};`` in the HTML.

        Raises:
            IndexError: If the page contains no such block.
        """
        match = self._PRODUCT_RE.search(self._as_text(self.html))
        if match is None:
            raise IndexError('product description block not found in page HTML')
        return match.group(1)

    def get_product_skuid(self):
        """Return the product's ``skuid`` as it appears in the description.

        Raises:
            IndexError: If the description block or its ``skuid`` entry is
                missing.
        """
        match = self._SKUID_RE.search(self.get_product())
        if match is None:
            raise IndexError('skuid entry not found in product description')
        return match.group(1)

    def get_product_name(self):
        """Placeholder; not implemented, so it always returns ``None``."""
        pass

    def get_product_price(self):
        """Request the product's price from the price endpoint.

        Returns:
            The ``p`` value of the first entry in the response, unchanged,
            or ``None`` when no usable value is present.

        NOTE(review): the page script quoted in the module docstring shows
        "no quote" when the parsed price is not > 0; that check is not
        applied here.
        """
        response = self._open(self._PRICE_URL % self.get_product_skuid())
        try:
            body = response.read()
        finally:
            response.close()
        return self._parse_price(body)
# Manual smoke test: fetches a live product page and prints its price.
if __name__ == '__main__':
    # Another sample product page: 'http://item.jd.com/1310118868.html'
    url = 'http://item.jd.com/1044773.html'
    jp = JdPrice(url)
    # The single-argument parenthesised form prints the same on Python 2 and 3.
    print(jp.get_product_price())
|
再给大家分享一个京东价格的爬虫:
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
|
from creepy import Crawler
from BeautifulSoup import BeautifulSoup
import urllib2
import json


class MyCrawler(Crawler):
    """Crawler that prints the title and price of each page it processes."""

    def process_document(self, doc):
        """Print the product title and price for a successfully fetched page.

        Documents whose status is not 200 are ignored.

        Args:
            doc: The fetched document; its ``status``, ``url`` and ``text``
                attributes are read.
        """
        if doc.status != 200:
            return
        print('[%d] %s' % (doc.status, doc.url))
        try:
            # Re-encode the body from gb18030 to UTF-8 before parsing.
            soup = BeautifulSoup(doc.text.decode('gb18030').encode('utf-8'))
        except Exception as e:
            # Best effort: report the problem and parse the body untouched.
            print(e)
            soup = BeautifulSoup(doc.text)
        print(soup.find(id="product-intro").div.h1.text)
        # The sku id is the last path segment of the URL without its extension.
        url_id = urllib2.unquote(doc.url).decode('utf8').split('/')[-1].split('.')[0]
        f = urllib2.urlopen('http://p.3.cn/prices/get?skuid=J_' + url_id, timeout=5)
        try:
            price = json.loads(f.read())
        finally:
            # Close the connection even when reading or parsing fails.
            f.close()
        print(price[0]['p'])
# Configure the crawler and start it from a single product page.
crawler = MyCrawler()
crawler.set_follow_mode(Crawler.F_SAME_HOST)
crawler.set_concurrency_level(16)
# Pattern for static-asset URLs (images, scripts, stylesheets, flash).
# NOTE(review): presumably these URLs are excluded from the crawl - confirm
# against the crawler library's add_url_filter documentation.
crawler.add_url_filter(r'\.(jpg|jpeg|gif|png|js|css|swf)$')
crawler.crawl('http://item.jd.com/982040.html')
|