Crawling Douban Movie Top 250 with Scrapy and storing it in MySQL

Date: 2022-09-18 18:33:19

GitHub link: https://github.com/GeraltLin/scrapy_mysql

1. Define the item. Since this is just a test case, we record only the movie name and the rating.

import scrapy

class MovieTop250Item(scrapy.Item):
    # define the fields for your item here
    moviename = scrapy.Field()  # movie title
    rating = scrapy.Field()     # rating score
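
Scrapy items behave like dicts, so the two fields can be sanity-checked in isolation (the values below are made up, just for illustration):

item = MovieTop250Item(moviename='肖申克的救赎', rating='9.7')
print(item['moviename'], item['rating'])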

2. Define the spider that crawls Douban Movie Top 250 and extracts the data

import scrapy
from movie_top250.items import MovieTop250Item

class MovieSpider(scrapy.Spider):
    name = 'douban_movie'
    allowed_domains = ['douban.com']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        # parse the first page directly; re-requesting response.url
        # would be dropped by Scrapy's duplicate filter
        yield from self.parse_page(response)
        # follow the pagination links and hand them to parse_page
        for page in response.xpath('//div[@class="paginator"]/a'):
            link = response.urljoin(page.xpath('@href').extract_first())
            yield scrapy.Request(link, callback=self.parse_page)

    def parse_page(self, response):
        for item in response.xpath('//div[@class="item"]'):
            movie = MovieTop250Item()
            movie['moviename'] = item.xpath('div[2]/div[1]/a/span[1]/text()').extract_first()
            movie['rating'] = item.xpath('div[2]/div[2]/div[1]/span[2]/text()').extract_first()
            yield movie
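
The XPath expressions can be verified in the Scrapy shell before running the full crawl; for example (the printed title depends on what the live page currently shows):

scrapy shell 'https://movie.douban.com/top250'
>>> response.xpath('//div[@class="item"]')[0].xpath('div[2]/div[1]/a/span[1]/text()').extract_first()
'肖申克的救赎'
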
3. Define the pipeline that processes the scraped items (stores them in MySQL)

Install the pymysql and Twisted packages first (pip install pymysql twisted); the pipeline uses them to talk to the database.
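
The pipeline below expects a doubanmovie database containing a movie table. A minimal setup sketch, run once before crawling (the column types are my assumptions; adjust them as needed):

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', charset='utf8')
with conn.cursor() as cur:
    # create the database and table the pipeline inserts into
    cur.execute("CREATE DATABASE IF NOT EXISTS doubanmovie DEFAULT CHARACTER SET utf8")
    cur.execute("""
        CREATE TABLE IF NOT EXISTS doubanmovie.movie (
            id INT AUTO_INCREMENT PRIMARY KEY,
            moviename VARCHAR(255) NOT NULL,
            rating VARCHAR(8) NOT NULL
        ) DEFAULT CHARACTER SET utf8
    """)
conn.close()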

import pymysql
from twisted.enterprise import adbapi

class MovieTop250Pipeline(object):
    def __init__(self):
        # connection parameters for the MySQL database; adbapi runs each
        # query in a worker thread, so inserts do not block the crawler
        self.dbpool = adbapi.ConnectionPool('pymysql',
                                            host='127.0.0.1',
                                            db='doubanmovie',
                                            user='root',
                                            passwd='123456',
                                            cursorclass=pymysql.cursors.DictCursor,
                                            charset='utf8',
                                            use_unicode=False)

    def process_item(self, item, spider):
        # run the insert asynchronously and attach an error handler
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self._handle_error, item, spider)
        return item

    def _conditional_insert(self, tx, item):
        # execute the parameterized insert inside the transaction
        sql = "insert into movie(moviename,rating) values(%s,%s)"
        params = (item["moviename"], item["rating"])
        tx.execute(sql, params)

    def _handle_error(self, failure, item, spider):
        print('--------------database operation exception!!-----------------')
        print('-------------------------------------------------------------')
        print(failure)
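
Because adbapi.ConnectionPool runs every interaction in a thread pool, database writes never block the Twisted reactor that drives the crawl. Optionally, the pool can also be released when the spider finishes; a small addition to the class above (not in the original code):

    def close_spider(self, spider):
        # shut down the connection pool's worker threads
        self.dbpool.close()
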
4. Settings

BOT_NAME = 'movie_top250'

SPIDER_MODULES = ['movie_top250.spiders']
NEWSPIDER_MODULE = 'movie_top250.spiders'

# use faker to generate a random User-Agent (Factory.create is faker's legacy API)
from faker import Factory
f = Factory.create()
USER_AGENT = f.user_agent()
ROBOTSTXT_OBEY = True

ITEM_PIPELINES = {
   'movie_top250.pipelines.MovieTop250Pipeline': 300,
}

5. Results

Run scrapy crawl douban_movie from the project root.
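
Once the crawl finishes, the stored rows can be checked with a quick query (a minimal sketch, reusing the credentials from the pipeline):

import pymysql

conn = pymysql.connect(host='127.0.0.1', db='doubanmovie', user='root',
                       passwd='123456', charset='utf8')
with conn.cursor() as cur:
    cur.execute("SELECT moviename, rating FROM movie LIMIT 10")
    for moviename, rating in cur.fetchall():
        print(moviename, rating)
conn.close()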
