The data my crawler scrapes is incomplete
import requests
from lxml import etree
from pymongo.collection import Collection
import pymongo
print("开始0")
class Dangdang(object):
    mongo_client = pymongo.MongoClient(host="localhost", port=27017)
    dangdang_db = mongo_client["dangdang_db"]

    def __init__(self):
        self.header = {
            "Host": "",
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Referer": "/books/bestsellers/01.00.00.00.00.00-24hours-0-0-1-2",
            "Accept-Encoding": "gzip,deflate",
            "Accept-Language": "zh-CN,zh;q=0.9"
        }
        self.dangdang = Collection(Dangdang.dangdang_db, "dangdang")
    def get_dangdang(self, page):
        """Request one bestseller page from Dangdang and return its <li> items."""
        url = "/books/bestsellers/01.00.00.00.00.00-24hours-0-0-1-%s" % page
        response = requests.get(url=url, headers=self.header)
        # requests.Response is truthy only for status codes below 400
        if response:
            # Parse the HTML response
            # print(response.text)
            html1 = etree.HTML(response.content)
            items = html1.xpath("//ul[@class='bang_list clearfix bang_list_mode']/li")
            return items
    def join_list(self, item):
        # Join an XPath result list into a single string
        return "".join(item)
    def parse_item(self, items):
        # Parse each list entry into a dict before writing it to MongoDB
        result_list = []
        for item in items:
            # Title
            title = item.xpath(".//div[@class='name']/a/@title")
            # Review/comment count
            comment = item.xpath(".//div[@class='star']/a/text()")
            # Author information
            author = item.xpath(".//div[@class='publisher_info'][1]/a[1]/@title")
            # Price
            price = item.xpath(".//div[@class='price']/p[1]/span[1]/text()")
            result_list.append(
                {
                    "title": self.join_list(title),
                    "comment": self.join_list(comment),
                    "author": self.join_list(author),
                    "price": self.join_list(price)
                }
            )
        return result_list
    def insert_data(self, result_list):
        self.dangdang.insert_many(result_list)
def main():
    d = Dangdang()
    print("start")
    import json
    for page in range(1, 26):
        items = d.get_dangdang(page=page)
        result = d.parse_item(items=items)
        # print(json.dumps(result))
        print(result)
        d.insert_data(result)
if __name__ == '__main__':
    main()
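
One way to see where rows are being lost: get_dangdang returns None when the response is falsy (requests treats any status code of 400 or above as falsy) and an empty list when the XPath matches nothing, and insert_many raises on an empty list, so whole pages can drop out quietly. Below is a minimal diagnostic sketch, not part of the original code: the check_pages helper and the expectation of roughly 20 items per bestseller page are my own assumptions. It reuses the Dangdang class above and only logs what each page actually yields before inserting.

def check_pages(d, pages=range(1, 26), expected_per_page=20):
    # Log how many <li> items each page yields before inserting anything,
    # so a failed request or an XPath miss shows up instead of silently
    # shrinking the collection. expected_per_page=20 is an assumption.
    total = 0
    for page in pages:
        items = d.get_dangdang(page=page)
        if not items:
            # None -> request failed / non-2xx status; [] -> XPath matched nothing
            print("page %s: no items (request failed or XPath matched nothing)" % page)
            continue
        if len(items) < expected_per_page:
            print("page %s: only %s items, the selector may be missing rows" % (page, len(items)))
        result = d.parse_item(items=items)
        if result:  # insert_many does not accept an empty list
            d.insert_data(result)
            total += len(result)
    print("inserted %s documents in total" % total)

Calling check_pages(Dangdang()) instead of main() reports every page whose request failed or whose XPath matched fewer rows than expected, which should point at the step where the data is going missing.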