Crawler Basics 5-4: Storing Data Scraped with Scrapy in MySQL

Posted: 2022-09-18 17:37:41

1. Environment:

Python 3.6, with the pymysql driver
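Both dependencies install with pip:

pip install scrapy pymysql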

2. Project: scrape the Three Hundred Tang Poems (唐诗三百首) from 古诗文网 (gushiwen.org)

Prerequisite: in the MySQL server, create a poem database and a poem_tbl table whose columns match the fields defined in items.py (a setup sketch follows).
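The post doesn't show the table definition itself, so here is a minimal one-off sketch using pymysql; the column names mirror items.py, but the types and lengths (VARCHAR/TEXT) are assumptions:

import pymysql

# Hypothetical helper script, not part of the Scrapy project:
# create the database and table the pipeline below writes into.
conn = pymysql.connect(host='127.0.0.1', user='root', password='root',
                       port=3306, charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("CREATE DATABASE IF NOT EXISTS poem DEFAULT CHARACTER SET utf8")
    cursor.execute("""
        CREATE TABLE IF NOT EXISTS poem.poem_tbl (
            id INT PRIMARY KEY AUTO_INCREMENT,
            title VARCHAR(255),
            author VARCHAR(255),
            category VARCHAR(255),
            article TEXT,
            note TEXT
        )
    """)
conn.commit()
conn.close()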

(One small regret: I haven't yet worked out how to scrape the appreciation section, which is loaded via AJAX.)

scrapy startproject poem

cd poem

scrapy genspider poetry gushiwen.org

poetry.py:

import scrapy
from poem.items import PoemItem


class PoetrySpider(scrapy.Spider):
    name = 'poetry'
    allowed_domains = ['gushiwen.org']
    start_urls = ['https://so.gushiwen.org/gushi/tangshi.aspx']

    def parse(self, response):
        # Each 'typecont' block on the index page is one category of poems.
        poems = response.xpath("//div[@class='sons']/div[@class='typecont']")
        for po in poems:
            item = PoemItem()
            item['category'] = po.xpath("./div/strong/text()").get()
            detail_urls = po.xpath(".//span/a/@href").getall()
            detail_urls = [response.urljoin(url) for url in detail_urls]
            for detail_url in detail_urls:
                # Pass a copy of the item: the detail requests run concurrently,
                # so sharing one mutable item would mix the fields up.
                yield scrapy.Request(url=detail_url, callback=self.parse_detail,
                                     meta={"item": item.copy()})

    def parse_detail(self, response):
        # The poem itself sits in the first 'sons' div, the notes in the second.
        details = response.xpath("//div[@class='main3']/div[@class='left']")
        for detail in details:
            item = response.meta.get("item")
            item['author'] = detail.xpath("./div[@class='sons'][1]//p/a[2]/text()").get()
            item['title'] = detail.xpath("./div[@class='sons'][1]//h1/text()").get()
            item['article'] = "".join(detail.xpath("./div[@class='sons'][1]"
                                                   "//div[@class='contson']//text()").getall()).strip()
            item['note'] = "".join(detail.xpath("./div[@class='sons'][2]"
                                                "/div[@class='contyishang']//p/text()").getall()).strip()
            yield item
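Side note: the spider above hands the item to the callback through request.meta. Newer Scrapy releases (1.7+) provide cb_kwargs for exactly this purpose; a minimal sketch of the alternative hand-off, assuming such a version is installed (the spider name here is hypothetical, for illustration only):

import scrapy
from poem.items import PoemItem


class PoetryKwargsSpider(scrapy.Spider):
    name = 'poetry_kwargs'  # hypothetical name, for illustration only
    allowed_domains = ['gushiwen.org']
    start_urls = ['https://so.gushiwen.org/gushi/tangshi.aspx']

    def parse(self, response):
        for po in response.xpath("//div[@class='sons']/div[@class='typecont']"):
            item = PoemItem()
            item['category'] = po.xpath("./div/strong/text()").get()
            for href in po.xpath(".//span/a/@href").getall():
                # cb_kwargs delivers the item as a plain keyword argument.
                yield scrapy.Request(response.urljoin(href),
                                     callback=self.parse_detail,
                                     cb_kwargs={"item": item.copy()})

    def parse_detail(self, response, item):
        item['title'] = response.xpath("//div[@class='main3']"
                                       "//div[@class='sons'][1]//h1/text()").get()
        yield item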

items.py:

import scrapy


class PoemItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # category
    category = scrapy.Field()
    # title
    title = scrapy.Field()
    # author
    author = scrapy.Field()
    # poem text
    article = scrapy.Field()
    # translation and annotations
    note = scrapy.Field()

settings.py:

# MySQL connection settings
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'poem'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'
MYSQL_PORT = 3306

ROBOTSTXT_OBEY = False

DOWNLOAD_DELAY = 1

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
    'Accept-Language': 'en',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}

ITEM_PIPELINES = {
    'poem.pipelines.MysqlTwistedPipeline': 1,
}
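The value 1 is the pipeline's order: Scrapy runs enabled pipelines in ascending order of these numbers, which conventionally fall in the 0-1000 range.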

pipelines.py:

import logging

import pymysql
import pymysql.cursors
from twisted.enterprise import adbapi

# scrapy.log is deprecated; use the standard logging module instead.
logger = logging.getLogger(__name__)


class MysqlTwistedPipeline(object):
    '''
    Write items to MySQL asynchronously through Twisted's adbapi.
    '''

    def __init__(self, dbpool):
        self.dbpool = dbpool

    # A class method with this exact name: Scrapy calls it to build the
    # pipeline, handing in the project settings before __init__ runs.
    @classmethod
    def from_settings(cls, settings):
        # Collect the connection parameters from settings into a dict.
        params = dict(
            host=settings["MYSQL_HOST"],
            port=settings["MYSQL_PORT"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset="utf8",
            # Return rows as dicts instead of tuples.
            cursorclass=pymysql.cursors.DictCursor,
            # Decode results to Unicode strings.
            use_unicode=True
        )
        # Twisted's connection pool; the first argument is the name of the
        # DB-API module to use, here "pymysql".
        dbpool = adbapi.ConnectionPool("pymysql", **params)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Insert the item asynchronously via Twisted.
        query = self.dbpool.runInteraction(self.do_insert, item)
        # Pass item and spider along so handle_error can report them;
        # leave them out here and handle_error need not accept them.
        query.addErrback(self.handle_error, item, spider)
        return item

    def do_insert(self, cursor, item):
        # Run the actual INSERT; no explicit commit is needed,
        # runInteraction commits for us.
        insert_sql = """
            insert into poem_tbl(title, author, category, article, note)
            values (%s, %s, %s, %s, %s)
        """
        cursor.execute(insert_sql, (item["title"], item["author"], item["category"],
                                    item["article"], item["note"]))
        logger.debug("inserted one poem")

    def handle_error(self, failure, item, spider):
        # Handle exceptions raised by the async insert.
        logger.error(failure)
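With the pipeline registered, the spider is run from the project root:

scrapy crawl poetry

Afterwards a quick query confirms the rows arrived; a minimal check, assuming the table sketched earlier:

import pymysql

conn = pymysql.connect(host='127.0.0.1', user='root', password='root',
                       db='poem', port=3306, charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("SELECT COUNT(*) FROM poem_tbl")
    print(cursor.fetchone())  # number of poems stored
conn.close()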