1. Environment:
Python 3.6, with the pymysql driver
2. Project: scraping the Three Hundred Tang Poems from gushiwen.org
Prerequisite: create a poem database in MySQL and a poem_tbl table inside it,
whose columns match the fields defined in items.py.
(One small regret: I haven't yet worked out how to scrape the appreciation section, which is loaded via Ajax.)
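For reference, a minimal schema sketch for poem_tbl; the source only says the columns mirror the item fields, so the column types, lengths, and the id primary key here are assumptions:

CREATE DATABASE IF NOT EXISTS poem DEFAULT CHARACTER SET utf8;
USE poem;
-- Column names mirror the fields in items.py; types/lengths are assumptions
CREATE TABLE IF NOT EXISTS poem_tbl (
    id INT AUTO_INCREMENT PRIMARY KEY,  -- assumed surrogate key
    category VARCHAR(255),
    title VARCHAR(255),
    author VARCHAR(255),
    article TEXT,
    note TEXT
) DEFAULT CHARACTER SET utf8;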
scrapy startproject poem
cd poem
scrapy genspider poetry gushiwen.org
poetry.py:
import scrapy
from poem.items import PoemItem


class PoetrySpider(scrapy.Spider):
    name = 'poetry'
    allowed_domains = ['gushiwen.org']
    start_urls = ['https://so.gushiwen.org/gushi/tangshi.aspx']

    def parse(self, response):
        # Each category block on the list page holds a heading plus the poem links
        poems = response.xpath("//div[@class='sons']/div[@class='typecont']")
        for po in poems:
            item = PoemItem()
            item['category'] = po.xpath("./div/strong/text()").get()
            detail_urls = po.xpath(".//span/a/@href").getall()
            detail_urls = list(map(lambda url: response.urljoin(url), detail_urls))
            for detail_url in detail_urls:
                # Pass a copy of the item so the concurrent detail callbacks
                # don't all mutate one shared object
                yield scrapy.Request(url=detail_url, callback=self.parse_detail,
                                     meta={"item": item.copy()})

    def parse_detail(self, response):
        # The detail page has a single main content column
        for sel in response.xpath("//div[@class='main3']/div[@class='left']"):
            item = response.meta.get("item")
            item['author'] = sel.xpath("./div[@class='sons'][1]//p/a[2]/text()").get()
            item['title'] = sel.xpath("./div[@class='sons'][1]//h1/text()").get()
            item['article'] = "".join(sel.xpath("./div[@class='sons'][1]"
                                                "//div[@class='contson']//text()").getall()).strip()
            item['note'] = "".join(sel.xpath("./div[@class='sons'][2]"
                                             "/div[@class='contyishang']//p/text()").getall()).strip()
            yield item
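Before running the full crawl, the list-page XPaths can be sanity-checked in scrapy shell (this assumes the page structure the spider targets is still current):

scrapy shell "https://so.gushiwen.org/gushi/tangshi.aspx"
>>> response.xpath("//div[@class='sons']/div[@class='typecont']/div/strong/text()").getall()
>>> response.xpath("//div[@class='sons']/div[@class='typecont']//span/a/@href").getall()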
items.py:
import scrapy


class PoemItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    # category
    category = scrapy.Field()
    # title
    title = scrapy.Field()
    # author
    author = scrapy.Field()
    # poem body
    article = scrapy.Field()
    # translation and annotations
    note = scrapy.Field()
settings.py:
# MySQL database settings
MYSQL_HOST = '127.0.0.1'
MYSQL_DBNAME = 'poem'
MYSQL_USER = 'root'
MYSQL_PASSWORD = 'root'
MYSQL_PORT = 3306
ROBOTSTXT_OBEY = False
DOWNLOAD_DELAY = 1
DEFAULT_REQUEST_HEADERS = {
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
'Accept-Language': 'en',
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.121 Safari/537.36'
}
ITEM_PIPELINES = {
'poem.pipelines.MysqlTwistedPipeline': 1,
}
pipelines.py:
import logging

import pymysql
import pymysql.cursors
from twisted.enterprise import adbapi

# standard logging, used here to trace the database interaction
logger = logging.getLogger(__name__)


class MysqlTwistedPipeline(object):
    '''
    Write items into MySQL asynchronously through Twisted's adbapi.
    '''

    # __init__ is the first method called when an object of this class is created
    def __init__(self, dbpool):
        self.dbpool = dbpool

    # Like a static method in Java, from_settings is available before any
    # instance exists; Scrapy calls it first, and __init__ then receives the
    # connection pool it builds
    @classmethod  # the method name is fixed by the Scrapy API
    def from_settings(cls, settings):
        # Pull the connection parameters out of settings and build a dict
        params = dict(
            host=settings["MYSQL_HOST"],
            port=settings["MYSQL_PORT"],
            db=settings["MYSQL_DBNAME"],
            user=settings["MYSQL_USER"],
            passwd=settings["MYSQL_PASSWORD"],
            charset="utf8",
            # cursor type
            cursorclass=pymysql.cursors.DictCursor,
            # return strings as Unicode
            use_unicode=True
        )
        # Connect through the connection pool provided by Twisted; the first
        # argument is the name of the DB-API module to use (pymysql here)
        dbpool = adbapi.ConnectionPool("pymysql", **params)
        return cls(dbpool)

    def process_item(self, item, spider):
        # Insert the item into the database asynchronously via Twisted
        query = self.dbpool.runInteraction(self.do_insert, item)
        logger.debug("------------------- connected -------------------")
        # item and spider are passed along so handle_error can report them;
        # if they were omitted here, handle_error would not need to accept them
        query.addErrback(self.handle_error, item, spider)
        return item

    def do_insert(self, cursor, item):
        logger.debug("------------------- inserting -------------------")
        # Run the actual INSERT; no explicit commit is needed,
        # runInteraction commits automatically
        insert_sql = """
            insert into poem_tbl(title, author, category, article, note)
            values (%s, %s, %s, %s, %s)
        """
        cursor.execute(insert_sql, (item["title"], item["author"], item["category"],
                                    item["article"], item["note"]))
        logger.debug("------------------- one insert finished -------------------")

    def handle_error(self, failure, item, spider):
        # Handle exceptions raised by the asynchronous insert
        print(failure)
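With the table created and the pipeline enabled, the crawl is started from the project root:

scrapy crawl poetry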