爬虫爬取到的数据不全

时间:2025-03-17 12:02:09
import pymongo
import requests
from lxml import etree
from pymongo.collection import Collection

print("开始0")

# NOTE(review): the host was missing from the URLs in the original source
# (they started at "/books/..."), which makes requests raise MissingSchema.
# bang.dangdang.com is the presumed bestseller host -- confirm before relying
# on it.
BASE_HOST = "bang.dangdang.com"
BASE_URL = "http://" + BASE_HOST
# Path of the 24-hour bestseller list; the page number is appended via "%s".
LIST_PATH = "/books/bestsellers/01.00.00.00.00.00-24hours-0-0-1-%s"


class Dangdang(object):
    """Scrape the Dangdang 24-hour bestseller list and store rows in MongoDB.

    The MongoDB client and database handle are class attributes, so they are
    created once when the class body executes and shared by all instances.
    """

    mongo_client = pymongo.MongoClient(host="localhost", port=27017)
    dangdang_db = mongo_client["dangdang_db"]

    def __init__(self):
        # Browser-like headers sent with every list-page request.
        self.header = {
            "Host": BASE_HOST,
            "Connection": "keep-alive",
            "Cache-Control": "max-age=0",
            "Upgrade-Insecure-Requests": "1",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9",
            "Referer": BASE_URL + LIST_PATH % 2,
            "Accept-Encoding": "gzip,deflate",
            "Accept-Language": "zh-CN,zh;q=0.9",
        }
        # Target collection "dangdang" inside the shared "dangdang_db" database.
        self.dangdang = Collection(Dangdang.dangdang_db, "dangdang")

    def get_dangdang(self, page, timeout=10):
        """Fetch one bestseller list page and return its ``<li>`` item nodes.

        Args:
            page: Page number substituted into the list URL.
            timeout: Seconds to wait for the server before giving up.

        Returns:
            A list of lxml elements, one per list entry. The list is empty
            when the response status is an error or the body cannot be
            parsed, so callers can always iterate the result.

        Raises:
            requests.RequestException: On connection failures or timeouts.
        """
        url = BASE_URL + LIST_PATH % page
        response = requests.get(url=url, headers=self.header, timeout=timeout)
        # A Response is falsy for 4xx/5xx statuses; previously this path fell
        # through to ``return items`` with ``items`` unbound.
        if not response:
            return []
        # Parse the raw bytes rather than response.text (as the original
        # did), leaving charset detection to lxml.
        document = etree.HTML(response.content)
        if document is None:
            return []
        return document.xpath(
            "//ul[@class='bang_list clearfix bang_list_mode']/li"
        )

    def join_list(self, item):
        """Concatenate a list of strings (an XPath result) into one string."""
        return "".join(item)

    def parse_item(self, items):
        """Convert list-entry nodes into plain dicts ready for MongoDB.

        Args:
            items: Iterable of lxml elements as returned by ``get_dangdang``.

        Returns:
            A list of dicts with the string fields ``title``, ``comment``,
            ``author`` and ``price``. A field is the empty string when its
            XPath matches nothing.
        """
        # Field name -> XPath evaluated relative to each list entry.
        field_paths = (
            ("title", ".//div[@class='name']/a/@title"),
            ("comment", ".//div[@class='star']/a/text()"),
            ("author", ".//div[@class='publisher_info'][1]/a[1]/@title"),
            ("price", ".//div[@class='price']/p[1]/span[1]/text()"),
        )
        return [
            {
                name: self.join_list(item.xpath(path))
                for name, path in field_paths
            }
            for item in items
        ]

    def insert_data(self, result_list):
        """Insert the parsed rows into the ``dangdang`` collection.

        Does nothing for an empty list, because ``insert_many`` rejects empty
        input. Note that ``insert_many`` adds an ``_id`` key to each dict in
        place.
        """
        if not result_list:
            return
        self.dangdang.insert_many(result_list)


def main():
    """Crawl list pages 1-25, print each page's rows and store them."""
    d = Dangdang()
    print("开始")
    for page in range(1, 26):
        try:
            items = d.get_dangdang(page=page)
        except requests.RequestException as exc:
            # One unreachable page should not discard the remaining pages.
            print("page %s failed: %s" % (page, exc))
            continue
        result = d.parse_item(items=items)
        print(result)
        d.insert_data(result)


if __name__ == '__main__':
    main()