
import scrapy class MzituScrapyItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
name = scrapy.Field()
image_urls = scrapy.Field()
url = scrapy.Field()
pass
官方的:
https://doc.scrapy.org/en/latest/topics/media-pipeline.html?highlight=item_complete#scrapy.pipelines.images.ImagesPipeline.item_completed
https://doc.scrapy.org/en/latest/topics/media-pipeline.html?highlight=item_complete
没有分类,很难看, 再重写一下ImagesPipeline中的file_path方法!
# -*- coding: utf-8 -*- # Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
import re class MzituScrapyPipeline(ImagesPipeline): def file_path(self, request, response=None, info=None):
"""
:param request: 每一个图片下载管道请求
:param response:
:param info:
:param strip :清洗Windows系统的文件夹非法字符,避免无法创建目录
:return: 每套图的分类目录
"""
item = request.meta['item']
folder = item['name']
folder_strip = strip(folder)
image_guid = request.url.split('/')[-1]
filename = u'full/{0}/{1}'.format(folder_strip, image_guid)
return filename def get_media_requests(self, item, info):
"""
:param item: spider.py中返回的item
:param info:
:return:
"""
for img_url in item['image_urls']:
referer = item['url']
yield Request(img_url, meta={'item': item,
'referer': referer}) def item_completed(self, results, item, info):
image_paths = [x['path'] for ok, x in results if ok]
if not image_paths:
raise DropItem("Item contains no images")
return item # def process_item(self, item, spider):
# return item def strip(path):
"""
:param path: 需要清洗的文件夹名字
:return: 清洗掉Windows系统非法文件夹名字的字符串
"""
path = re.sub(r'[?\\*|“<>:/]', '', str(path))
return path if __name__ == "__main__":
a = '我是一个?\*|“<>:/错误的字符串'
print(strip(a))
写一个中间件来处理图片下载的防盗链:
class MeiZiTu(object): def process_request(self, request, spider):
'''设置headers和切换请求头
:param request: 请求体
:param spider: spider对象
:return: None
'''
referer = request.meta.get('referer', None)
if referer:
request.headers['referer'] = referer
最后一步设置ImagesPipeline的存储目录!
在settings.py中写入:
IMAGES_STORE = 'F:\mzitu\\'
在settings.py中写入以下配置。
# 30 days of delay for images expiration
IMAGES_EXPIRES = 30
ITEM_PIPELINES = {
'mzitu_scrapy.pipelines.MzituScrapyPipeline': 300,
}
DOWNLOADER_MIDDLEWARES = {
'mzitu_scrapy.middlewares.MeiZiTu': 543,
}
