scrapy 下载图片 from cuiqingcai

时间:2023-03-09 16:48:29
scrapy 下载图片 from cuiqingcai
import scrapy

class MzituScrapyItem(scrapy.Item):
# define the fields for your item here like:
# name = scrapy.Field()
name = scrapy.Field()
image_urls = scrapy.Field()
url = scrapy.Field()
pass

  

官方的:

https://doc.scrapy.org/en/latest/topics/media-pipeline.html?highlight=item_complete#scrapy.pipelines.images.ImagesPipeline.item_completed

https://doc.scrapy.org/en/latest/topics/media-pipeline.html?highlight=item_complete

scrapy 下载图片 from cuiqingcai

没有分类,很难看, 再重写一下ImagesPipeline中的file_path方法!

# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
from scrapy import Request
from scrapy.pipelines.images import ImagesPipeline
from scrapy.exceptions import DropItem
import re class MzituScrapyPipeline(ImagesPipeline): def file_path(self, request, response=None, info=None):
"""
:param request: 每一个图片下载管道请求
:param response:
:param info:
:param strip :清洗Windows系统的文件夹非法字符,避免无法创建目录
:return: 每套图的分类目录
"""
item = request.meta['item']
folder = item['name']
folder_strip = strip(folder)
image_guid = request.url.split('/')[-1]
filename = u'full/{0}/{1}'.format(folder_strip, image_guid)
return filename def get_media_requests(self, item, info):
"""
:param item: spider.py中返回的item
:param info:
:return:
"""
for img_url in item['image_urls']:
referer = item['url']
yield Request(img_url, meta={'item': item,
'referer': referer}) def item_completed(self, results, item, info):
image_paths = [x['path'] for ok, x in results if ok]
if not image_paths:
raise DropItem("Item contains no images")
return item # def process_item(self, item, spider):
# return item def strip(path):
"""
:param path: 需要清洗的文件夹名字
:return: 清洗掉Windows系统非法文件夹名字的字符串
"""
path = re.sub(r'[?\\*|“<>:/]', '', str(path))
return path if __name__ == "__main__":
a = '我是一个?\*|“<>:/错误的字符串'
print(strip(a))

  

写一个中间件来处理图片下载的防盗链:

class MeiZiTu(object):

    def process_request(self, request, spider):
'''设置headers和切换请求头
:param request: 请求体
:param spider: spider对象
:return: None
'''
referer = request.meta.get('referer', None)
if referer:
request.headers['referer'] = referer

  

最后一步设置ImagesPipeline的存储目录!

在settings.py中写入:

IMAGES_STORE = 'F:\mzitu\\'

在settings.py中写入以下配置。

# 30 days of delay for images expiration

IMAGES_EXPIRES = 30
ITEM_PIPELINES = {
   'mzitu_scrapy.pipelines.MzituScrapyPipeline': 300,
}

DOWNLOADER_MIDDLEWARES = {

   'mzitu_scrapy.middlewares.MeiZiTu': 543,
}
scrapy 下载图片 from cuiqingcai