GitHub link: https://github.com/GeraltLin/scrapy_mysql
1. Define the item. Since this is just a test case, we only record the movie name and its rating.
import scrapy

class MovieTop250Item(scrapy.Item):
    # define the fields for your item here
    moviename = scrapy.Field()
    rating = scrapy.Field()
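A scrapy.Item behaves like a dict, which is all the pipeline later relies on. A quick interactive check (assumes the project package is importable; the sample values are made up):

from movie_top250.items import MovieTop250Item

movie = MovieTop250Item()
movie['moviename'] = 'Example Movie'   # made-up sample value
movie['rating'] = '9.7'                # made-up sample value
print(dict(movie))   # {'moviename': 'Example Movie', 'rating': '9.7'}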
2. Define the Spider to crawl the Douban Movie Top 250 and extract the data.
import scrapy
from movie_top250.items import MovieTop250Item

class movieSpider(scrapy.Spider):
    name = 'douban_movie'
    allowed_domains = ['douban.com']
    start_urls = ['https://movie.douban.com/top250']

    def parse(self, response):
        # the first response is already page 1; parse it directly
        # instead of re-requesting the same URL
        for movie in self.parse_page(response):
            yield movie
        # collect the links to the remaining pages and hand them to parse_page
        for page in response.xpath('//div[@class="paginator"]/a'):
            link = response.urljoin(page.xpath('@href').extract()[0])
            yield scrapy.Request(link, callback=self.parse_page)

    def parse_page(self, response):
        for item in response.xpath('//div[@class="item"]'):
            movie = MovieTop250Item()
            movie['moviename'] = item.xpath('div[2]/div[1]/a/span[1]/text()').extract()[0]
            movie['rating'] = item.xpath('div[2]/div[2]/div[1]/span[2]/text()').extract()[0]
            yield movie
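Before running the full crawl, the XPath expressions can be sanity-checked interactively with scrapy shell (the selectors below mirror parse_page; first is just a throwaway name):

scrapy shell 'https://movie.douban.com/top250'
>>> first = response.xpath('//div[@class="item"]')[0]
>>> first.xpath('div[2]/div[1]/a/span[1]/text()').extract()[0]        # the movie name
>>> first.xpath('div[2]/div[2]/div[1]/span[2]/text()').extract()[0]   # the rating

If Douban rejects the request, a browser User-Agent can be passed with -s USER_AGENT='...'.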
3. Define the pipeline to process the scraped items (store them in MySQL).
You need to install the pymysql and twisted packages first (pip install pymysql twisted); the pipeline uses them together for the database writes. It also assumes the target database and table already exist; a setup sketch follows.
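The pipeline below inserts into a movie table in a doubanmovie database, and neither is created automatically. A minimal one-off setup sketch; the column names match the pipeline's insert statement, but the types and lengths are my assumptions:

import pymysql

# one-off setup; credentials match the pipeline below
conn = pymysql.connect(host='127.0.0.1', user='root', passwd='123456', charset='utf8')
with conn.cursor() as cur:
    cur.execute("CREATE DATABASE IF NOT EXISTS doubanmovie DEFAULT CHARACTER SET utf8")
    cur.execute("""CREATE TABLE IF NOT EXISTS doubanmovie.movie (
                       id INT AUTO_INCREMENT PRIMARY KEY,
                       moviename VARCHAR(255),   -- assumed length
                       rating VARCHAR(8)         -- stored as text, e.g. '9.7'
                   ) DEFAULT CHARACTER SET utf8""")
conn.commit()
conn.close()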
import pymysql
from twisted.enterprise import adbapi

class MovieTop250Pipeline(object):
    def __init__(self):
        # connection parameters for the database pool
        self.dbpool = adbapi.ConnectionPool('pymysql',
                                            host='127.0.0.1',
                                            db='doubanmovie',
                                            user='root',
                                            passwd='123456',
                                            cursorclass=pymysql.cursors.DictCursor,
                                            charset='utf8',
                                            use_unicode=False)

    def process_item(self, item, spider):
        # run the insert in the pool's worker thread
        query = self.dbpool.runInteraction(self._conditional_insert, item)
        query.addErrback(self._handle_error, item, spider)  # handle exceptions
        return item

    def _conditional_insert(self, tx, item):
        sql = "insert into movie(moviename,rating) values(%s,%s)"
        params = (item["moviename"], item["rating"])
        # execute the SQL statement
        tx.execute(sql, params)

    def _handle_error(self, failure, item, spider):
        print('--------------database operation exception!!-----------------')
        print('-------------------------------------------------------------')
        print(failure)
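Why adbapi? Scrapy runs on Twisted's event loop, and a blocking pymysql call inside process_item would stall every concurrent request; adbapi.ConnectionPool runs each runInteraction in a worker thread instead. For contrast, a sketch of the simpler blocking variant (BlockingMoviePipeline is a hypothetical name, not part of this project):

import pymysql

class BlockingMoviePipeline(object):
    """Simpler alternative: one blocking connection, no thread pool."""
    def open_spider(self, spider):
        self.conn = pymysql.connect(host='127.0.0.1', db='doubanmovie',
                                    user='root', passwd='123456', charset='utf8')

    def process_item(self, item, spider):
        with self.conn.cursor() as cur:
            cur.execute("insert into movie(moviename,rating) values(%s,%s)",
                        (item["moviename"], item["rating"]))
        self.conn.commit()
        return item

    def close_spider(self, spider):
        self.conn.close()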
4. Settings
BOT_NAME = 'movie_top250'
SPIDER_MODULES = ['movie_top250.spiders']
NEWSPIDER_MODULE = 'movie_top250.spiders'

# generate a random browser User-Agent with faker
from faker import Factory
f = Factory.create()
USER_AGENT = f.user_agent()

ROBOTSTXT_OBEY = True

ITEM_PIPELINES = {
    'movie_top250.pipelines.MovieTop250Pipeline': 300,
}
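Factory.create() is the older faker entry point; newer faker releases recommend instantiating the Faker class directly, which is equivalent:

from faker import Faker

f = Faker()
USER_AGENT = f.user_agent()   # a random browser UA string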
5. Result
Run scrapy crawl douban_movie from the project root; the movie names and ratings are written to the movie table as the crawl proceeds.
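Once the crawl finishes, a quick query confirms the rows landed in MySQL (same credentials as the pipeline):

import pymysql

conn = pymysql.connect(host='127.0.0.1', db='doubanmovie', user='root',
                       passwd='123456', charset='utf8')
with conn.cursor() as cur:
    cur.execute("select moviename, rating from movie limit 5")
    for moviename, rating in cur.fetchall():
        print(moviename, rating)
conn.close()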