笔趣阁是很容易爬取的网站。这里简单爬取了全部小说链接和每本小说的全部章节链接;如果还想爬取章节内容,在biquge.py里再加一个爬取循环,并在pipelines.py中添加保存函数即可。
1 创建一个scrapy项目:scrapy startproject biqugetest(注意:下文代码的导入路径使用的包名是 scrapytest,项目名需与导入路径保持一致)
2 cd biqugetest
3 生成一个爬虫:scrapy genspider biquge xbiquge.la
4 提取数据:完善spider,使用xpath等方法
5 保存数据:pipeline中保存数据
biquge.py
# -*- coding: utf-8 -*-
import scrapy
# 自定义spider类,继承scrapy.Spider
from scrapytest.items import BiqugeItem, BiqugeItem_detail
class BiqugeSpider(scrapy.Spider):
    """Crawl the novel index page and the chapter lists of the first novels.

    ``parse`` yields one ``BiqugeItem`` (name and link) for every anchor in
    the ``novellist`` section of the start page and schedules a detail
    request for the first two novels that have a link.  ``parse_detail``
    yields one ``BiqugeItem_detail`` (chapter link and name) per chapter
    anchor on a novel page.
    """

    # Spider name, used to select this spider when running the crawl.
    name = 'biquge'
    # Domains the spider is allowed to request, so it does not wander off-site.
    allowed_domains = ['xbiquge.la']
    # URL the crawl starts from.
    start_urls = ['http://www.xbiquge.la/xiaoshuodaquan/']

    def parse(self, response):
        """Extract every novel on the index page.

        Args:
            response: The downloaded index page; it supports ``xpath``
                queries directly.

        Yields:
            A ``BiqugeItem`` per novel anchor, plus a ``scrapy.Request``
            (handled by ``parse_detail``) for the first two novels.
        """
        followed = 0  # number of novel pages scheduled so far
        for anchor in response.xpath('//div[@class="novellist"]//a'):
            book = BiqugeItem()
            # extract_first() returns the first match, or None if there is none.
            book['name'] = anchor.xpath('.//text()').extract_first()
            book['link'] = anchor.xpath('.//@href').extract_first()
            yield book

            # Only the first two novels are followed for now.  An anchor
            # without an href is skipped instead of being handed to
            # scrapy.Request, which cannot be built from None.
            if followed < 2 and book['link']:
                yield scrapy.Request(
                    response.urljoin(book['link']),
                    callback=self.parse_detail,
                )
                followed += 1

    def parse_detail(self, response):
        """Extract every chapter link from a novel page.

        Args:
            response: The downloaded novel page.

        Yields:
            A fresh ``BiqugeItem_detail`` for each chapter anchor that has
            an href.
        """
        for anchor in response.xpath('//*[@id="list"]/dl/dd/a'):
            href = anchor.xpath('./@href').extract_first()
            if href is None:
                continue

            # A new item per chapter: re-yielding one shared, mutated item
            # lets later iterations overwrite data a pipeline still holds.
            chapter = BiqugeItem_detail()
            # urljoin resolves the href against the page URL, so relative,
            # root-relative and absolute hrefs all give a well-formed URL.
            chapter['section_link'] = response.urljoin(href)
            chapter['section_name'] = anchor.xpath('./text()').extract_first()
            yield chapter
==============================================
pipelines.py
import json
from scrapytest.items import BiqugeItem_detail, BiqugeItem
class ScrapytestPipeline(object):
    """Persist every ``BiqugeItem`` as one JSON object per line.

    The output file is named ``全部小说.csv`` although its content is JSON
    lines, not comma-separated values.  Items of any other type pass through
    untouched.
    """

    def process_item(self, item, spider):
        """Write ``item`` to the output file if it is a ``BiqugeItem``.

        Called once for every item the spider yields.

        Args:
            item: The scraped item.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, always, so later pipelines receive it too.
        """
        if isinstance(item, BiqugeItem):
            # ensure_ascii=False keeps Chinese text readable in the file.
            self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def open_spider(self, spider):
        """Open the output file once, when the spider starts."""
        # Explicit UTF-8: the non-ASCII JSON written above must not depend
        # on the platform's default locale encoding.
        self.file = open('全部小说.csv', 'w', encoding='utf-8')

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.file.close()
class BiqugeDetailPipeline(object):
    """Persist every ``BiqugeItem_detail`` as one JSON object per line.

    The output file is named ``小说章节.csv`` although its content is JSON
    lines, not comma-separated values.  Items of any other type pass through
    untouched.
    """

    def open_spider(self, spider):
        """Open the output file once, when the spider starts."""
        # Explicit UTF-8: the non-ASCII JSON written by process_item must
        # not depend on the platform's default locale encoding.
        self.file = open('小说章节.csv', 'w', encoding='utf-8')

    def process_item(self, item, spider):
        """Write ``item`` to the output file if it is a ``BiqugeItem_detail``.

        Args:
            item: The scraped item.
            spider: The spider that produced the item (unused).

        Returns:
            The same ``item``, always, so later pipelines receive it too.
        """
        if isinstance(item, BiqugeItem_detail):
            # ensure_ascii=False keeps Chinese text readable in the file.
            self.file.write(json.dumps(dict(item), ensure_ascii=False) + '\n')
        return item

    def close_spider(self, spider):
        """Close the output file when the spider finishes."""
        self.file.close()
==============================================
items.py
import scrapy
# 设置爬取的字段(key field)
class BiqugeItem(scrapy.Item):
    """Item describing one novel: its name and its link."""

    # Declared fields; these are the only keys the item accepts.
    name = scrapy.Field()  # novel name
    link = scrapy.Field()  # novel link
class BiqugeItem_detail(scrapy.Item):
    """Item describing one chapter: its link and its name."""

    section_link = scrapy.Field()  # chapter link
    section_name = scrapy.Field()  # chapter name


# NOTE: with an Item, a misspelled key raises an error, which guards against
# typos; with a plain dict a misspelled key raises nothing and simply adds a
# new key.