在Scrapy中使用IP池或用户代理更新版(python3)

时间:2021-10-05 16:57:37

middlewares.py

 1 # -*- coding: utf-8 -*-
 2 # 导入随机模块
 3 import random
 4 # 导入有关IP池有关的模块
 5 from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
 6 # 导入有关用户代理有关的模块
 7 from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware
 8 
 9 # IP池
10 class HTTPPROXY(HttpProxyMiddleware):
11     # 初始化 注意一定是 ip=''
12     def __init__(self, ip=''):
13         self.ip = ip
14 
15     def process_request(self, request, spider):
16         item = random.choice(IPPOOL)
17         try:
18             print("当前的IP是:"+item["ipaddr"])
19             request.meta["proxy"] = "http://"+item["ipaddr"]
20         except Exception as e:
21             print(e)
22             pass
23 
24 
25 # 设置IP池
26 IPPOOL = [
27     {"ipaddr": "182.117.102.10:8118"},
28     {"ipaddr": "121.31.102.215:8123"},
29     {"ipaddr": "1222.94.128.49:8118"}
30 ]
31 
32 
33 # 用户代理
34 class USERAGENT(UserAgentMiddleware):
35     #初始化 注意一定是 user_agent=''
36     def __init__(self, user_agent=''):
37         self.user_agent = user_agent
38 
39     def process_request(self, request, spider):
40         item = random.choice(UPPOOL)
41         try:
42             print("当前的User-Agent是:"+item)
43             request.headers.setdefault('User-Agent', item)
44         except Exception as e:
45             print(e)
46             pass
47 
48 
49 # 设置用户代理池
50 UPPOOL = [
51     "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393"
52 ]

settngs.py中添加一下代码(注意根据项目名修改指向,如这里的工程名是“的demo3”)

1 DOWNLOADER_MIDDLEWARES = {
2     # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware':123,
3     # 'demo3.middlewares.HTTPPROXY' : 125,
4     'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
5     'demo3.middlewares.USERAGENT': 1
6 }