1.用户代理池
downloader middleware实现随机更换User-Agent.
fake-useragent库:up to date simple useragent faker with real world database.
github-搜索fake-useragent
https://fake-useragent.herokuapp.com/browsers/0.1.5
通过以上 URL 可以查询到所有可用的 user-agent。注意:URL 末尾是版本号,最新版本号可在 PyPI 的 fake-useragent 页面上查到。
安装:pip install fake-useragent
middlewares.py
from fake_useragent import UserAgent


# NOTE(review): class name misspells "Middleware"; kept as-is because
# settings.py references it by the dotted string
# 'picspider.middlewares.RandomUserAgentMiddlware'.
class RandomUserAgentMiddlware(object):
    """Downloader middleware that sets a random User-Agent on each request.

    The UA string comes from fake-useragent; which UA family is produced
    ("random", "chrome", "firefox", ...) is selected by the RANDOM_UA_TYPE
    setting (default: "random").
    """

    def __init__(self, crawler):
        super(RandomUserAgentMiddlware, self).__init__()
        self.ua = UserAgent()
        # Name of the UserAgent attribute to read, e.g. ua.random / ua.chrome.
        self.ua_type = crawler.settings.get("RANDOM_UA_TYPE", "random")

    @classmethod
    def from_crawler(cls, crawler):
        """Scrapy entry point: build the middleware from the crawler,
        which carries all configuration and parameters (settings, etc.).

        :param crawler: the running Crawler instance
        :return: a configured RandomUserAgentMiddlware
        """
        return cls(crawler)

    def process_request(self, request, spider):
        # getattr() reads the attribute named by ua_type, so the UA family
        # is selectable purely via settings. (The original wrapped this in
        # a needless per-request nested function; inlined here.)
        request.headers.setdefault("User-Agent", getattr(self.ua, self.ua_type))
settings.py
# Enable or disable downloader middlewares.
DOWNLOADER_MIDDLEWARES = {
    # Activate the custom random User-Agent middleware.
    'picspider.middlewares.RandomUserAgentMiddlware': 543,
    # Disable the project's default downloader middleware.
    'picspider.middlewares.PicspiderDownloaderMiddleware': None,
}
# Control switch: which fake-useragent attribute the middleware reads
# ("random", "chrome", "firefox", ...).
RANDOM_UA_TYPE = "random"
===========================================================
定义一个main.py,调试(debug)使用。
# -*- coding: utf-8 -*-
"""Debug entry point: launch the `pic` spider so it can be run under an IDE debugger."""
from scrapy.cmdline import execute

import sys
import os

# Make the project root importable regardless of the current working directory.
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# BUG FIX: the command name was misspelled "scray"; scrapy.cmdline.execute
# expects argv of the form ["scrapy", "crawl", <spider_name>].
execute(["scrapy", "crawl", "pic"])