import re
import requests
import threading
from queue import Queue, Empty
from lxml import etree
class ProxyIp:
    @classmethod
    def get_proxy_ip(cls):
        url1 = "http://www.youdaili.net/"
        href_queue = Queue()  # links on the site whose pages list proxy IPs
        ip_queue = Queue()  # raw IP entries found on those pages
        ip_valid = []  # proxies that pass the check
        url1_resp = requests.get(url1, timeout=10)
        content = url1_resp.content.decode()
        html = etree.HTML(content)
        box = html.xpath("//div[@class='m_box2']/ul/li/a")
        for a in box[:24]:  # take at most the first 24 links, so a shorter list cannot raise IndexError
            href_queue.put(a.get("href"))  # enqueue each link
        while not href_queue.empty():  # fetch every proxy-list page
            ip_href = href_queue.get()
            try:
                resp = requests.get(ip_href, timeout=10)
                content = resp.text
                html = etree.HTML(content)
                ips = html.xpath("//div[@class='cont_font']/p/span[1]")
                for ip in ips[0].itertext():
                    ip_queue.put(ip)
            except Exception as e:
                print(e)
                continue  # skip a broken page instead of aborting the whole crawl
        checkth = []
        print("Starting Checking...")
        for i in range(30):  # verify availability with 30 worker threads
            checkth.append(CheckThreads(ip_queue, ip_valid))
        for th in checkth:
            th.start()
        for th in checkth:
            th.join()
        ip_valid = list(set(ip_valid))  # deduplicate
        return ip_valid
class CheckThreads(threading.Thread):
    mutex = threading.Lock()  # guards the shared ip_valid list across threads

    def __init__(self, ip_queue, ip_valid):
        super(CheckThreads, self).__init__()
        self.ip_queue = ip_queue
        self.ip_valid = ip_valid

    def run(self):
        ip_re = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{2,5})')  # compile once, not per item
        while True:
            try:
                # get_nowait() avoids the race between empty() and get() when
                # several threads drain the same queue
                ip = self.ip_queue.get_nowait()
            except Empty:
                break
            match = ip_re.search(ip)
            if match is None:  # entry has no ip:port pair; skip so ip_proxy is never unbound
                continue
            ip_proxy = match.group()
            try:
                proxies = {"http": "http://" + ip_proxy}  # requests expects a scheme on proxy URLs
                headers = {
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                    "Accept-Encoding": "gzip, deflate",
                    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                    "Cache-Control": "no-cache",
                    "Connection": "keep-alive",
                    "Host": "www.1356789.com",
                    "Pragma": "no-cache",
                    "Referer": "http://m.ip138.com/",
                }
                resp = requests.get(url="http://www.1356789.com/", proxies=proxies, headers=headers, timeout=10)
                if resp.status_code == 200:
                    page = resp.text
                    ip_url = re.search(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', page).group()
                    ip_now = re.search(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', ip_proxy).group()
                    with self.mutex:  # the lock was declared but never used; take it around the shared append
                        if ip_url == ip_now:  # the echo page reports the proxy's own IP: anonymous
                            self.ip_valid.append(ip_proxy + " anonymous proxy")
                            print(ip_proxy + " anonymous proxy")
                        else:  # the echo page reports a different IP: transparent
                            self.ip_valid.append(ip_proxy + " transparent proxy")
                            print(ip_proxy + " transparent proxy")
            except Exception:
                print(ip_proxy + " proxy is invalid or responded too slowly!")
if __name__ == "__main__":
    ip = ProxyIp.get_proxy_ip()
    with open("ip.txt", "w", encoding="utf-8") as f:  # context manager closes the file even on error
        for i in ip:
            try:
                f.write(i + "\n")
            except Exception as e:
                print(e)
This crawler scrapes proxy IPs from 24 list pages on the site, then filters the scraped IPs: each one is checked for availability and classified as an anonymous or a transparent proxy.
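For illustration, a minimal sketch of consuming the saved list, assuming each line of ip.txt has the "ip:port label" form written above; the target URL here is a placeholder, not part of the original script:

import requests

with open("ip.txt", encoding="utf-8") as f:
    for line in f:
        addr = line.split()[0]  # the ip:port part, before the label
        proxies = {"http": "http://" + addr}
        try:
            resp = requests.get("http://example.com/", proxies=proxies, timeout=5)
            print(addr, resp.status_code)
        except requests.RequestException as e:
            print(addr, "failed:", e)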
Source: http://tor1024.com/spider/4218ZDsqRItW57236333