import re
import requests
import threading
from queue import Queue, Empty
from lxml import etree
class ProxyIp:
    @classmethod
    def get_proxy_ip(cls):
        url1 = "http://www.youdaili.net/"
        href_queue = Queue()  # links on the site whose pages list proxy IPs
        ip_queue = Queue()  # raw IP entries found on those pages
        ip_valid = []  # proxies that pass the check
        url1_resp = requests.get(url1, timeout=10)
        content = url1_resp.content.decode()
        html = etree.HTML(content)
        box = html.xpath("//div[@class='m_box2']/ul/li/a")
        for a in box[:24]:  # take at most the first 24 links, so a shorter list cannot raise IndexError
            href_queue.put(a.get("href"))  # enqueue each link
        while not href_queue.empty():  # fetch every proxy-list page
            ip_href = href_queue.get()
            try:
                resp = requests.get(ip_href, timeout=10)
                content = resp.text
                html = etree.HTML(content)
                ips = html.xpath("//div[@class='cont_font']/p/span[1]")
                for ip in ips[0].itertext():
                    ip_queue.put(ip)
            except Exception as e:
                print(e)
                continue  # skip a broken page instead of aborting the whole crawl
        checkth = []
        print("Starting Checking...")
        for i in range(30):  # verify availability with 30 worker threads
            checkth.append(CheckThreads(ip_queue, ip_valid))
        for th in checkth:
            th.start()
        for th in checkth:
            th.join()
        ip_valid = list(set(ip_valid))  # deduplicate
        return ip_valid
class CheckThreads(threading.Thread):
    mutex = threading.Lock()  # guards the shared ip_valid list across threads

    def __init__(self, ip_queue, ip_valid):
        super(CheckThreads, self).__init__()
        self.ip_queue = ip_queue
        self.ip_valid = ip_valid

    def run(self):
        ip_re = re.compile(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}:\d{2,5})')  # compile once, not per item
        while True:
            try:
                # get_nowait() avoids the race between empty() and get() when
                # several threads drain the same queue
                ip = self.ip_queue.get_nowait()
            except Empty:
                break
            match = ip_re.search(ip)
            if match is None:  # entry has no ip:port pair; skip so ip_proxy is never unbound
                continue
            ip_proxy = match.group()
            try:
                proxies = {"http": "http://" + ip_proxy}  # requests expects a scheme on proxy URLs
                headers = {
                    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                    "Accept-Encoding": "gzip, deflate",
                    "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
                    "Cache-Control": "no-cache",
                    "Connection": "keep-alive",
                    "Host": "www.1356789.com",
                    "Pragma": "no-cache",
                    "Referer": "http://m.ip138.com/",
                }
                resp = requests.get(url="http://www.1356789.com/", proxies=proxies, headers=headers, timeout=10)
                if resp.status_code == 200:
                    page = resp.text
                    ip_url = re.search(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', page).group()
                    ip_now = re.search(r'(\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3})', ip_proxy).group()
                    with self.mutex:  # the lock was declared but never used; take it around the shared append
                        if ip_url == ip_now:  # the echo page reports the proxy's own IP: anonymous
                            self.ip_valid.append(ip_proxy + " anonymous proxy")
                            print(ip_proxy + " anonymous proxy")
                        else:  # the echo page reports a different IP: transparent
                            self.ip_valid.append(ip_proxy + " transparent proxy")
                            print(ip_proxy + " transparent proxy")
            except Exception:
                print(ip_proxy + " proxy is invalid or responded too slowly!")
if __name__ == "__main__":
    ip = ProxyIp.get_proxy_ip()
    with open("ip.txt", "w", encoding="utf-8") as f:  # context manager closes the file even on error
        for i in ip:
            try:
                f.write(i + "\n")
            except Exception as e:
                print(e)
This crawler scrapes proxy IPs from 24 list pages on the site, then filters the scraped IPs: each one is checked for availability and classified as an anonymous or a transparent proxy.
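For illustration, a minimal sketch of consuming the saved list, assuming each line of ip.txt has the "ip:port label" form written above; the target URL here is a placeholder, not part of the original script:

import requests

with open("ip.txt", encoding="utf-8") as f:
    for line in f:
        addr = line.split()[0]  # the ip:port part, before the label
        proxies = {"http": "http://" + addr}
        try:
            resp = requests.get("http://example.com/", proxies=proxies, timeout=5)
            print(addr, resp.status_code)
        except requests.RequestException as e:
            print(addr, "failed:", e)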
Source: http://tor1024.com/spider/4218ZDsqRItW57236333