middlewares.py
# -*- coding: utf-8 -*-
# Random selection from the pools below.
import random
# Base class for the proxy-pool middleware.
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
# Base class for the User-Agent middleware.
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware


# Proxy pool middleware
class HTTPPROXY(HttpProxyMiddleware):
    """Downloader middleware that assigns a random proxy from IPPOOL to each request."""

    # Initializer. Original author's note: the parameter must be declared as ip=''.
    # The base-class initializer is not called here.
    # NOTE(review): confirm no inherited method relies on attributes it would set.
    def __init__(self, ip=''):
        # Stored only; nothing in this class reads it.
        self.ip = ip

    def process_request(self, request, spider):
        """Set request.meta["proxy"] to a randomly chosen entry of IPPOOL.

        Always returns None. When IPPOOL is empty or the chosen entry is
        malformed, the request is left untouched.
        """
        # random.choice raises IndexError on an empty sequence; skip instead.
        if not IPPOOL:
            return None
        entry = random.choice(IPPOOL)
        try:
            address = entry["ipaddr"]
            proxy_url = "http://" + address
        except (KeyError, TypeError) as e:
            # Best effort: report the malformed entry and send the request unproxied.
            print(e)
            return None
        # Assign the proxy before printing so a failing print cannot skip it.
        request.meta["proxy"] = proxy_url
        print("当前的IP是:" + address)
        return None


# Proxy pool: each entry is a dict whose "ipaddr" value is "host:port".
# NOTE(review): the first octet of "1222.94.128.49" exceeds 255, so it is not a
# valid IPv4 address -- probably a typo; confirm the intended address.
IPPOOL = [
    {"ipaddr": "182.117.102.10:8118"},
    {"ipaddr": "121.31.102.215:8123"},
    {"ipaddr": "1222.94.128.49:8118"},
]


# User-Agent middleware
class USERAGENT(UserAgentMiddleware):
    """Downloader middleware that offers a random User-Agent from UPPOOL to each request."""

    # Initializer. Original author's note: the parameter must be declared as user_agent=''.
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        """Pick a random entry of UPPOOL and set it as the default User-Agent header.

        Uses headers.setdefault, so a User-Agent already present on the
        request is kept. Always returns None. When UPPOOL is empty the
        request is left untouched.
        """
        # random.choice raises IndexError on an empty sequence; skip instead.
        if not UPPOOL:
            return None
        agent = random.choice(UPPOOL)
        try:
            print("当前的User-Agent是:" + agent)
            request.headers.setdefault('User-Agent', agent)
        except TypeError as e:
            # Best effort: a non-string pool entry is reported and ignored.
            print(e)
        return None


# User-Agent pool
UPPOOL = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
]
在 settings.py 中添加以下代码(注意根据项目名修改指向,例如这里的工程名是“demo3”)
# Downloader middleware registry: maps a middleware's import path to its order value.
# Adjust the "demo3" prefix to match your own project name.
DOWNLOADER_MIDDLEWARES = {
    # Custom random User-Agent middleware (order 1).
    'demo3.middlewares.USERAGENT': 1,
    # Scrapy's built-in User-Agent middleware (order 2).
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    # Proxy middlewares, left disabled in the original:
    # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware':123,
    # 'demo3.middlewares.HTTPPROXY' : 125,
}