# Python 爬虫(爬取唯品会) / Python web scraper for Vipshop (唯品会)
#
# 时间 / Date: 2025-01-17 10:34:39
import json

import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from time import sleep


class VipSpider(object):
    """Browser-driven scraper that searches a shop site and saves the results.

    The spider opens the start page in a Selenium browser, submits a search
    keyword, walks a range of result pages, scrolls each one so lazily loaded
    goods are rendered, extracts one record per product and writes all
    records to a JSON file.

    NOTE(review): this class was reconstructed from a copy in which every
    dotted token (``self.x``, ``module.func``, parts of CSS selectors, file
    and host names) had been stripped. Attribute names, the browser type,
    the CSS selectors and the output file name are best-effort
    reconstructions; confirm them against the original script and the live
    page markup.
    """

    # NOTE(review): the original file name was lost; "vip.json" is assumed.
    OUTPUT_PATH = "vip.json"
    # Number of scroll rounds per page and the pause (seconds) after each.
    SCROLL_ROUNDS = 20
    SCROLL_PAUSE = 5
    # NOTE(review): only "var q=" and "=10000" survived in the source; the
    # usual scroll-to-offset assignment is assumed for the middle part.
    SCROLL_JS = "var q=document.documentElement.scrollTop=10000"
    # Seconds to wait for the HTTP request that resolves a page URL.
    REQUEST_TIMEOUT = 30

    def __init__(self, url, search, start_page, end_page):
        """Store the crawl parameters and start the browser.

        Args:
            url: Address of the site's start page.
            search: Keyword typed into the site's search box.
            start_page: First result page to crawl (int or numeric string).
            end_page: Last result page to crawl, inclusive.
        """
        self.url = url
        self.search = search
        self.start_page = start_page
        self.end_page = end_page
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36"}
        # NOTE(review): the browser class was lost; Chrome is assumed.
        self.driver = webdriver.Chrome()

    @staticmethod
    def _strip_last_param(url):
        """Return ``url`` without its last ``&...`` segment.

        A URL containing no "&" is returned unchanged, instead of losing its
        final character as a ``find``/``rfind`` + slice would.
        """
        head, sep, _tail = url.rpartition("&")
        return head if sep else url

    def _first_by_xpath(self, xpath):
        """Return the first element matching ``xpath`` in the current page.

        Uses ``find_elements("xpath", ...)`` because the
        ``find_elements_by_xpath`` shortcut no longer exists in current
        Selenium releases, while this form works on old and new versions.

        Raises:
            IndexError: If nothing matches.
        """
        matches = self.driver.find_elements("xpath", xpath)
        if not matches:
            raise IndexError("no element matches xpath: %s" % xpath)
        return matches[0]

    def handle_click(self):
        """Open the start page, type the keyword and submit the search."""
        self.driver.get(self.url)
        self._first_by_xpath("//*[@id='J_main_nav_link']/li[13]/a").click()
        sleep(2)
        self._first_by_xpath(
            "//*[@id='J-search']/div[1]/input").send_keys(self.search)
        sleep(2)
        self._first_by_xpath("//*[@id='J-search']/div[1]/a/span").click()
        sleep(3)

    def handle_url(self, page):
        """Build the URL of result page ``page`` from the browser's current URL.

        The trailing "&..." parameter of the current URL is dropped, a
        ``page`` query parameter is added, and the URL that ``requests``
        ends up with is printed and returned.
        """
        # Example current URL: "/?keyword=%E7%AF%AE%E7%90%83&ff=235|12|1|1"
        # NOTE(review): the original search call was lost (find vs. rfind);
        # cutting at the last "&" is assumed.
        base_url = self._strip_last_param(self.driver.current_url)
        res = requests.get(url=base_url, params={"page": page},
                           headers=self.headers,
                           timeout=self.REQUEST_TIMEOUT)
        newurl = res.url
        print(newurl)
        return newurl

    def scroll_page(self, req):
        """Load ``req`` in the browser, scroll repeatedly, return the HTML.

        Args:
            req: URL of the result page to load.

        Returns:
            The page source after all scroll rounds have run.
        """
        self.driver.get(req)
        sleep(3)
        for _ in range(self.SCROLL_ROUNDS):
            # Run the scroll script, then pause before the next round.
            self.driver.execute_script(self.SCROLL_JS)
            sleep(self.SCROLL_PAUSE)
        return self.driver.page_source

    def download(self, request):
        """Extract product records from one result page.

        Args:
            request: HTML text of a result page.

        Returns:
            A list of dicts, one per product, holding the image link, name,
            discount, sale price and market price. Empty when the results
            section is not present in the page.
        """
        soup = BeautifulSoup(request, "lxml")
        sections = soup.select("section#J_searchCatList")
        if not sections:
            return []
        # NOTE(review): every class selector below lost its leading part in
        # the source (only "-goods", "-info ...", "-market-price" survived);
        # the prefixes are assumptions to verify against the page markup.
        items = []
        for div in sections[0].select(".c-goods"):
            items.append({
                "图片链接": div.img["data-original"],
                "商品名称": div.select(".goods-info a")[0].get_text(),
                "商品折扣": div.select(".goods-info span")[0].get_text(),
                "特卖价格": div.select(".goods-info em")[0].get_text(),
                "原始价格": div.select(
                    ".goods-info .goods-market-price")[0].get_text(),
            })
        return items

    def startSpider(self):
        """Crawl the configured page range and write the records as JSON.

        The search is submitted once before the first page is fetched, no
        matter which page the range starts at, so ranges that do not begin
        at page 1 also work. The browser is closed when crawling ends,
        including on error, so the instance cannot be reused afterwards.
        """
        pages = range(int(self.start_page), int(self.end_page) + 1)
        records = []
        try:
            if pages:
                self.handle_click()
            for page in pages:
                page_url = self.handle_url(page)
                html = self.scroll_page(page_url)
                records += self.download(html)
        finally:
            self.driver.quit()
        # Storage: serialize the record list to a JSON string. Non-ASCII
        # text is kept readable, matching the UTF-8 encoding of the file.
        string = json.dumps(records, ensure_ascii=False)
        with open(self.OUTPUT_PATH, "w", encoding="utf-8") as fp:
            fp.write(string)


def main():
    """Prompt for a keyword and page range, then run the crawl."""
    # NOTE(review): the host part of this URL was lost in the source (only
    # "/" survived); the Vipshop home page is assumed.
    url = "https://www.vip.com/"
    search = input("请输入你要搜索的商品:")
    start_page = input("请输入你要爬取的起始页:")
    end_page = input("请输入你要爬取的结束页:")
    spider = VipSpider(url, search, start_page, end_page)
    # The page numbers are passed on as typed; VipSpider converts them.
    spider.startSpider()


# Start the interactive crawl only when this file is run as a script.
if __name__ == "__main__":
    main()