代码：
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @File : HtmlParser.py
# @Author: 赵路仓
# @Date : 2020/3/17
# @Desc :
# @Contact : 398333404@qq.com
import json
from lxml import etree
import requests
from bs4 import BeautifulSoup
# Default JD search URL for the keyword "ps4".
# NOTE(review): this module-level url appears unused — page() builds its own
# URL from the page number; confirm before removing.
url = "https://search.jd.com/Search?keyword=ps4&enc=utf-8&wq=ps4&pvid=cf0158c8664442799c1146a461478c9c"
# Browser-like request headers (copied from a real browser session, including
# referer and user-agent) sent with every search request.
head = {
'authority' : 'search.jd.com' ,
'method' : 'GET' ,
'path' : '/s_new.php?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=4&s=84&scrolling=y&log_id=1529828108.22071&tpl=3_M&show_items=7651927,7367120,7056868,7419252,6001239,5934182,4554969,3893501,7421462,6577495,26480543553,7345757,4483120,6176077,6932795,7336429,5963066,5283387,25722468892,7425622,4768461' ,
'scheme' : 'https' ,
'referer' : 'https://search.jd.com/Search?keyword=%E6%89%8B%E6%9C%BA&enc=utf-8&qrst=1&rt=1&stop=1&vt=2&wq=%E6%89%8B%E6%9C%BA&cid2=653&cid3=655&page=3&s=58&click=0' ,
'user-agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/66.0.3359.139 Safari/537.36' ,
'x-requested-with' : 'XMLHttpRequest' ,
}
def page(page):
    """Fetch one JD search-result page for the keyword "ps4" and print
    each item's name, price, comment count and link.

    :param page: page number as a string — JD paginates via the ``page``
        query parameter, so it is concatenated directly into the URL.
    :raises requests.RequestException: on network failure or timeout.
    """
    print("开始")
    url = ("https://search.jd.com/Search?keyword=ps4&enc=utf-8&qrst=1&rt=1"
           "&stop=1&vt=1&wq=ps4&page=" + page + "&s=181&click=0")
    r = requests.get(url, timeout=3, headers=head)
    # Use the encoding requests detects from the body so Chinese text
    # decodes correctly instead of defaulting to ISO-8859-1.
    r.encoding = r.apparent_encoding
    _element = etree.HTML(r.text)
    # Each product card on the result page is an <li class="gl-item ...">.
    datas = _element.xpath('//li[contains(@class,"gl-item")]')
    for data in datas:
        p_price = data.xpath('div/div[@class="p-price"]/strong/i/text()')
        p_comment = data.xpath('div/div[5]/strong/a/text()')
        p_name = data.xpath('div/div[@class="p-name p-name-type-2"]/a/em/text()')
        p_href = data.xpath('div/div[@class="p-name p-name-type-2"]/a/@href')
        # xpath() returns lists of text nodes; join them into flat strings.
        comment = ' '.join(p_comment)
        name = ' '.join(p_name)
        price = ' '.join(p_price)
        href = ' '.join(p_href)
        # BUG FIX: the original printed the raw list p_comment although the
        # joined `comment` string was computed; print the joined value.
        print(name, price, comment, href)
# BUG FIX: the original read `__name__ = = "__main__"` (extraction-garbled
# `==`), which is a syntax error; restore the equality operator and the
# guard-body indentation.
if __name__ == "__main__":
    # Scrape page 5 of the "ps4" search results.
    page("5")
爬取结果：
以上就是python 爬虫爬取某东ps4售卖情况的详细内容,更多关于python 爬虫的资料请关注服务器之家其它相关文章!
原文链接:https://www.cnblogs.com/zlc364624/p/12874090.html