scrapy selenium爬取淘宝商品信息并存储到数据库

时间:2024-03-19 12:57:35
  1. 主要内容,官网首页selenium模拟发送搜索关键字,搜索页获取商品链接并进行翻页爬取,其中商品详情页有不同类型的页面,进行不同的抓取方法,处理抓取数据并进行保存。

  2. scrapy.spiders代码如下:

import scrapy
from scrapy import Request
from TaoBao.items import TaobaoItem
import re
class ExampleSpider(scrapy.Spider):
    """Crawl Taobao search results through the Selenium download middleware.

    Flow driven by ``meta['page']`` (interpreted by SeleniumMiddlewares):
      0 - homepage: the middleware types the search keyword and submits,
      1 - product detail page: plain browser load,
      2 - search results: the middleware clicks the "next page" button.
    Detail pages come in two layouts: tmall (``//detail...``) and taobao
    (``//item...``); each has its own set of XPaths.
    """

    name = 'taobao'

    def start_requests(self):
        # page=0 tells the middleware to perform the keyword search.
        yield Request(url='http://www.taobao.com', callback=self.parse,
                      dont_filter=True, meta={'page': 0})

    def parse(self, response):
        """Extract product-detail links from a search-result page and paginate."""
        good_urls = response.xpath('//div[@class="row row-2 title"]/a/@href').extract()
        for good_url in good_urls:
            # Links are usually protocol-relative ("//item.taobao.com/...").
            # Only add a scheme when one is genuinely missing: the original
            # `re.search('http:', ...)` test does NOT match "https:" and so
            # re-prefixed URLs that already carried a scheme.
            if not good_url.startswith(('http:', 'https:')):
                good_url = 'https:' + good_url
            yield Request(url=good_url, callback=self.parse_dail, meta={'page': '1'})
        # Re-request the (new) current URL; page=2 makes the middleware click
        # "next page", so each response carries a fresh URL and is not
        # filtered as a duplicate.
        if re.search(r'search', response.url):
            yield Request(url=response.url, callback=self.parse, meta={'page': '2'})

    # NOTE(review): "parse_dail" is a typo for "parse_detail"; kept to avoid
    # breaking anything that references the method by name.
    def parse_dail(self, response):
        """Parse a product detail page, dispatching on the URL layout."""
        if re.search(r'//detail', response.url):
            # Tmall layout: sales count and comment count share the
            # "tm-count" class, in that order.
            counts = response.xpath('//span[@class="tm-count"]/text()').extract()
            if len(counts) < 2:
                # Page did not render as expected (e.g. login wall); skip
                # instead of raising IndexError as the original did.
                self.logger.warning('tm-count fields missing on %s', response.url)
                return
            yield self._make_item(
                response,
                goods_name=response.xpath('//div[@class="tb-detail-hd"]/h1/text()').extract(),
                shop_name=response.xpath('//div[@class="name"]/a[@class="shopLink"]/text()').extract(),
                price=response.xpath('//span[@class="tm-price"]/text()').extract(),
                sales_volumn=counts[0],
                comments=counts[1],
            )
        elif re.search(r'//item', response.url):
            # Plain taobao layout: counters have dedicated element ids.
            yield self._make_item(
                response,
                goods_name=response.xpath('//h3[@class="tb-main-title"]/text()').extract(),
                shop_name=response.xpath('//div[@class="shop-name-wrap"]/a[@class="shop-name-link"]/text()').extract(),
                price=response.xpath('//em[@class="tb-rmb-num"]/text()').extract(),
                sales_volumn=''.join(response.xpath('//strong[@id="J_SellCounter"]/text()').extract()),
                comments=''.join(response.xpath('//strong[@id="J_RateCounter"]/text()').extract()),
            )

    def _make_item(self, response, goods_name, shop_name, price, sales_volumn, comments):
        """Normalise extracted fragments into a populated TaobaoItem.

        ``goods_name``/``shop_name``/``price`` are raw xpath extract lists;
        ``sales_volumn``/``comments`` are already strings.
        """
        item = TaobaoItem()
        item['good_url'] = response.url
        # Join fragments and collapse all internal whitespace in the title.
        item['goods_name'] = ''.join(''.join(goods_name).split())
        item['shop_name'] = ''.join(shop_name)
        # Last price in the list is the effective one; guard against an
        # empty extract (the original price[-1] would raise IndexError).
        item['price'] = price[-1] if price else ''
        item['sales_volumn'] = sales_volumn
        item['comments'] = comments
        return item

  1. scrapy.items中:
import scrapy

class TaobaoItem(scrapy.Item):
    """Container for one scraped Taobao/Tmall product, persisted by TaobaoPipeline."""

    # Absolute URL of the product detail page.
    good_url = scrapy.Field()
    # Product title with internal whitespace stripped.
    goods_name = scrapy.Field()
    # Name of the shop selling the product.
    shop_name = scrapy.Field()
    # Displayed price (string, as extracted from the page).
    price = scrapy.Field()
    # Sales count (note: "volumn" is a typo for "volume", kept for compatibility).
    sales_volumn = scrapy.Field()
    # Number of customer comments/ratings.
    comments = scrapy.Field()
  1. scrapy.pipelines中:
import sqlite3
class TaobaoPipeline(object):
    """Persist scraped Taobao items into a local SQLite database (taobao.db)."""

    def __init__(self):
        self.conn = sqlite3.connect('taobao.db')
        self.cursor = self.conn.cursor()
        # Idempotent: safe to run on every spider start.
        self.cursor.execute(
            "create table IF NOT EXISTS taobagoods("
            "good_url varchar(200),goods_name varchar(200),shop_name varchar(500),"
            "price varchar(100),sales_volumn varchar(100),comments varchar(100))"
        )

    def process_item(self, item, spider):
        """Insert one item and commit; returns the item for downstream pipelines.

        Uses ``?`` placeholders: the original built the SQL with ``%`` string
        interpolation, which breaks on any value containing a quote and is
        open to SQL injection.
        """
        self.cursor.execute(
            "insert into taobagoods values(?,?,?,?,?,?)",
            (item["good_url"], item["goods_name"], item["shop_name"],
             item["price"], item["sales_volumn"], item["comments"]),
        )
        self.conn.commit()
        return item

    def close_spider(self, spider):
        """Called by Scrapy on shutdown; release the database connection."""
        self.conn.close()

  1. settings.py中:
# Scrapy project settings for the TaoBao crawler.
BOT_NAME = 'TaoBao'
# Taobao's robots.txt disallows crawling; obeying it would block every request.
ROBOTSTXT_OBEY = False
USER_AGENT = 'TaoBao (+http://www.yourdomain.com)'
SPIDER_MODULES = ['TaoBao.spiders']
NEWSPIDER_MODULE = 'TaoBao.spiders'
SPIDER_MIDDLEWARES = {
   'TaoBao.middlewares.TaobaoSpiderMiddleware': 543,
}
# All downloads are routed through the Selenium middleware below, which
# returns a rendered HtmlResponse instead of using Scrapy's downloader.
DOWNLOADER_MIDDLEWARES = {
   'TaoBao.middlewares.SeleniumMiddlewares': 543,
}
# Store every yielded item via the SQLite pipeline.
ITEM_PIPELINES = {
   'TaoBao.pipelines.TaobaoPipeline': 300,
}
  1. 中间件中:
from selenium import webdriver
from selenium.webdriver.firefox.options import Options as FOptions
import time
from scrapy.http import HtmlResponse


class SeleniumMiddlewares(object):
    """Downloader middleware that renders Taobao pages in a real Firefox browser.

    ``request.meta['page']`` (set by the spider) selects the interaction:
      0 - homepage: type the search keyword ('python') and submit,
      1 - product detail page: plain load,
      2 - search results: scroll to the bottom and click the "next page" button.
    Returning an HtmlResponse short-circuits Scrapy's normal downloader.
    """

    def __init__(self):
        self.options = FOptions()
        # Uncomment to run Firefox without a visible window.
        #self.options.add_argument("-headless")
        # NOTE(review): hard-coded geckodriver path, and `executable_path` /
        # `firefox_options` are Selenium 3 APIs removed in Selenium 4 —
        # confirm the pinned selenium version before upgrading.
        self.browser = webdriver.Firefox(executable_path="/home/hello/Downloads/geckodriver",firefox_options=self.options)
    def process_request(self,request,spider):
        # page == 0: perform the initial keyword search on the homepage.
        if int(request.meta['page']) == 0:
            self.browser.get(request.url)
            input_name =self.browser.find_element_by_xpath('//*[@id="q"]')
            input_name.click()
            # Search keyword is hard-coded here rather than passed in.
            input_name.send_keys('python')
            btn_seacher = self.browser.find_element_by_xpath('//button[@class="btn-search tb-bg"]')
            btn_seacher.click()
            # Fixed sleep to let the results page render; no explicit wait used.
            time.sleep(3)

        # page == 1: product detail page, just load it.
        if int(request.meta['page']) == 1:
            self.browser.get(request.url)
            time.sleep(3)

        # page == 2: search-results pagination — scroll down (the button is
        # lazily rendered at the bottom) and click "next page" (下一页).
        if int(request.meta['page']) == 2:
            self.browser.get(request.url)
            self.browser.execute_script("window.scrollTo(0,document.body.scrollHeight)")
            time.sleep(1)
            next_page = self.browser.find_element_by_xpath('//span[contains(text(),"下一页")]')
            next_page.click()
            time.sleep(2)
        # Hand the rendered DOM back to Scrapy as the response for this request.
        return HtmlResponse(url=self.browser.current_url, body=self.browser.page_source, encoding="utf-8",
                            request=request)
  1. 数据库结果如图:
    scrapy selenium爬取淘宝商品信息并存储到数据库
  2. 代码结构不复杂,未添加太多注释,有疑问的亲们欢迎留言,看到回复哟