ip代理池(练手)

时间:2025-04-10 07:43:24

利用scrapy框架进行爬取免费代理网站所提供的代理ip

# -*- coding: utf-8 -*-
import random
from time import sleep

import redis
import scrapy

from redis_con import redis_save


class IpPoolSpider(scrapy.Spider):
    """Scrapy spider that scrapes free proxy IPs and stores them in Redis.

    NOTE(review): the original snippet had its site domain stripped;
    `allowed_domains` / `start_urls` / the `next_url` base are placeholders —
    fill in the real proxy-site URL before running.
    """

    name = 'ip_pool'
    allowed_domains = ['/gaoni/']
    start_urls = ['/gaoni/']

    def parse(self, response):
        # First <td> of each table row holds the proxy IP address.
        ips = response.xpath(
            '//table[contains(@class,"fl-table")]/tbody/tr/td[1]/text()'
        ).getall()

        # Persist this page's batch into the Redis 'ips' list.
        redis_save(ips)

        # The last pagination <li> links to the next page (relative href;
        # prepend the site base URL here).
        next_url = "" + response.xpath(
            '//ul[contains(@class,"pagination")]/li[last()]/a/@href'
        ).get()
        print(next_url)

        # Random 1-10 s delay between pages to avoid getting banned.
        sleep(random.randint(1, 10))
        yield scrapy.Request(next_url, callback=self.parse, dont_filter=True)

把爬取到的代理ip存储到redis缓存,并在每次使用之前都进行可达判断,不可达从缓存中将其删掉

from random import choice

import redis

from ip_check import check

redis_client = (host='127.0.0.1',password='123')



def redis_show():
    """Dump every cached proxy IP to a text file, one per line.

    Opens in 'a+' mode so repeated dumps append rather than overwrite.
    """
    with open('ips.txt', 'a+') as f:
        for ip in redis_client.lrange('ips', 0, -1):
            # lrange returns bytes (no decode_responses on the client),
            # so stringify before concatenating the newline.
            f.write(str(ip) + '\n')

    print('数据展示保存在文本完成')



def redis_save(ips):
    """Push each scraped proxy onto the head of the Redis 'ips' list."""
    for proxy in ips:
        redis_client.lpush('ips', proxy)




def redis_check():
    """Validate every cached proxy and drop the unreachable ones.

    ip_check.check() returns the usable proxy URL on success and
    False on failure; dead entries are removed from Redis.
    """
    for ip in redis_client.lrange('ips', 0, -1):
        if not check(ip):
            # count=0 removes every occurrence of this dead proxy.
            redis_client.lrem('ips', 0, ip)


def redis_delete():
    """Remove the whole 'ips' list from Redis.

    The original ltrim('ips', -1, 0) trick keeps the slice [-1, 0],
    which is empty for lists longer than one element — but on a
    single-element list index -1 == 0, so that element survived.
    delete() clears the key unconditionally.
    """
    redis_client.delete('ips')


def redis_out():
    """Return one verified, working proxy from the Redis cache.

    Repeatedly samples a proxy from the cached list, verifies it with
    ip_check.check(), and purges dead entries (both locally and in
    Redis) until a reachable proxy is found.

    Returns:
        The proxy URL from check() on success, or None when the cache
        is empty / fully exhausted (the original looped forever here).
    """
    ips = list(redis_client.lrange('ips', 0, -1))

    while ips:
        # Sample from the tail-side slice of the lrange output.
        # NOTE(review): lpush puts new items at the head, so this slice
        # actually covers the OLDEST ~199 entries — confirm whether the
        # newest were intended instead.
        ip = choice(ips[-1:-200:-1])
        result = check(ip)
        print(result)
        if result:
            return result
        # Dead proxy: drop every occurrence so we never re-sample it.
        ips = [p for p in ips if p != ip]
        redis_client.lrem('ips', 0, ip)
    return None
if __name__ == '__main__':
    # Debug dump: print each cached proxy with a 1-based counter.
    for i, ip in enumerate(redis_client.lrange('ips', 0, -1), start=1):
        print(i)
        print(ip)
    # redis_delete()  # uncomment to wipe the cached pool


判断ip是否可达

import requests


def check(ip):
    """Probe whether a cached proxy is reachable.

    `ip` arrives as the bytes value Redis returned; str(b'1.2.3.4:80')
    is "b'1.2.3.4:80'", so [2:-1] strips the b'...' repr wrapper.

    Returns:
        The usable proxy URL ("http://host:port") on success,
        False when the request fails or times out.
    """
    proxy = "http://" + str(ip)[2:-1]
    print(proxy)

    try:
        # httpbin echoes the caller's IP over plain HTTP — a cheap probe
        # that also confirms traffic really goes through the proxy.
        requests.get("http://httpbin.org/ip",
                     proxies={"http": proxy}, timeout=5)
    except requests.RequestException:
        # Narrowed from a bare except: that also swallowed
        # KeyboardInterrupt / SystemExit.
        print('失败')
        return False
    else:
        print('ok')
        return proxy

在下载中间件进行设置代理ip

class IPTest(MyFirstSpiderDownloaderMiddleware):
    """Downloader middleware that routes each request through a cached proxy."""

    def process_request(self, request, spider):
        # Attach a verified proxy from the Redis pool to every outgoing
        # request; Scrapy's HttpProxyMiddleware reads request.meta['proxy'].
        request.meta['proxy'] = redis_out()