Python 爬虫 多进程清洗代理

时间:2021-09-21 03:07:41

利用多进程检测代理网站提供的免费代理是否可用

 1 import requests
 2 from lxml import etree
 3 import time
 4 import multiprocessing
 5 
 6 def get_all_proxy(queue):
 7     url = 'http://www.xicidaili.com/nn/1'
 8     headers = {
 9         'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36',
10     }
11     response = requests.get(url, headers=headers)
12     html_ele = etree.HTML(response.text)
13 
14     ip_eles = html_ele.xpath('//table[@id="ip_list"]/tr/td[2]/text()')
15     port_ele = html_ele.xpath('//table[@id="ip_list"]/tr/td[3]/text()')
16     # proxy_list = []
17     for i in range(0,len(ip_eles)):
18         proxy_str = 'http://' + ip_eles[i] + ':' + port_ele[i]
19         #proxy_list.append(proxy_str)
20         #print(proxy_str)
21         queue.put(proxy_str)
22 
def check_one_proxy(proxy):
    """Probe *proxy* by fetching a Baidu search page through it.

    :param proxy: proxy URL string, e.g. 'http://1.2.3.4:8080'.
    :return: the proxy string if it answered with HTTP 200, else None.
    """
    url = 'http://www.baidu.com/s?wd=ip'
    proxy_dict = {
        'http': proxy
    }
    try:
        response = requests.get(url, proxies=proxy_dict, timeout=5)
    except requests.RequestException:
        # Connection error / timeout: the proxy is unusable.
        return None
    if response.status_code == 200:
        print(proxy)
        return proxy
    print('bad   '+proxy)
    # BUG FIX: the original returned the proxy here too, so proxies that
    # answered with a non-200 status still ended up in the "valid" list.
    return None
42 
if __name__ == '__main__':
    # multiprocessing.Queue.get raises queue.Empty on timeout.
    from queue import Empty

    start_time = time.time()
    # Queue shared with the producer process that scrapes the proxy list.
    q = multiprocessing.Queue()
    # Run the scraper in its own process so checking overlaps with scraping.
    p = multiprocessing.Process(target=get_all_proxy, args=(q,))
    p.start()

    # Fan the availability checks out over a pool of 30 worker processes.
    pool = multiprocessing.Pool(30)
    result_list = []
    while True:
        try:
            proxy_str = q.get(timeout=5)
        except Empty:
            # No new proxy for 5 s: assume the producer has finished.
            # (The original bare `except:` also swallowed KeyboardInterrupt.)
            break
        result_list.append(pool.apply_async(check_one_proxy, (proxy_str,)))

    # Collect results; AsyncResult.get() blocks until each check finishes.
    valid_proxy_list = []
    for proxy_res in result_list:
        result = proxy_res.get()
        if result is not None:
            valid_proxy_list.append(result)
    print('All proxy we can get:')
    print(valid_proxy_list)
    pool.close()
    pool.join()
    p.join()

    end_time = time.time()
    print('--'*30)
    print('耗时:' + str(end_time-start_time))