Use the Scrapy framework to crawl the proxy IPs provided by a free proxy site.
# -*- coding: utf-8 -*-
import random
from time import sleep

import scrapy

from redis_con import redis_save


class IpPoolSpider(scrapy.Spider):
    name = 'ip_pool'
    allowed_domains = ['/gaoni/']
    start_urls = ['/gaoni/']

    def parse(self, response):
        # Extract the first column (the IP addresses) from the proxy table
        ips = response.xpath('//table[contains(@class,"fl-table")]/tbody/tr/td[1]/text()').getall()
        redis_save(ips)
        # Build the next-page URL from the pagination bar (the site's base URL is left blank here)
        next_url = "" + response.xpath('//ul[contains(@class,"pagination")]/li[last()]/a/@href').get()
        print(next_url)
        # Random pause between pages to avoid hammering the site
        sleep(random.randint(1, 10))
        yield scrapy.Request(next_url, callback=self.parse, dont_filter=True)
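Inside a Scrapy project the spider is normally started with scrapy crawl ip_pool; as a standalone sketch it can also be driven from a script with CrawlerProcess. The module name ip_pool_spider below is an assumption, not part of the original:

# run.py - minimal sketch, assuming the spider above is saved as ip_pool_spider.py
from scrapy.crawler import CrawlerProcess

from ip_pool_spider import IpPoolSpider

if __name__ == '__main__':
    process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
    process.crawl(IpPoolSpider)   # schedule the spider
    process.start()               # run until crawling finishes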
Store the crawled proxy IPs in a Redis cache, check each one for reachability before it is used, and delete unreachable ones from the cache.
from random import choice

import redis

from ip_check import check

redis_client = redis.Redis(host='127.0.0.1', password='123')


def redis_show():
    # Dump the current proxy list to a text file for inspection (the file name is left blank here)
    with open('', 'a+') as f:
        for ip in redis_client.lrange('ips', 0, -1):
            f.write(str(ip) + '\n')
    print('proxy list dumped to text file')


def redis_save(ips):
    # Push every crawled proxy onto the Redis list 'ips'
    for ip in ips:
        redis_client.lpush('ips', ip)
    # redis_show()


def redis_check():
    for ip in redis_client.lrange('ips', 0, -1):
        pass


def redis_delete():
    # Clear the whole list: an inverted ltrim range keeps no elements
    redis_client.ltrim('ips', -1, 0)


def redis_out():
    # Return one reachable proxy; unreachable ones are removed from the cache
    ips = []
    for ip in redis_client.lrange('ips', 0, -1):
        ips.append(ip)
    flag = True
    while flag:
        # Pick a random candidate from the last 199 entries of the list
        ip = choice(ips[-1:-200:-1])
        result = check(ip)
        print(result)
        if result:
            flag = False
            return result
        else:
            # Unreachable: delete every occurrence of this proxy from Redis
            redis_client.lrem('ips', 0, ip)
    # ip = choice(ips)
    # if check(ip):
    #     return ip
    # else:
    #     redis_client.lrem('ips', 0, ip)


if __name__ == '__main__':
    i = 1
    for ip in redis_client.lrange('ips', 0, -1):
        print(i)
        i += 1
        print(ip)
    # redis_delete()
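A rough usage sketch of the module above: redis_save pushes crawled proxies onto the 'ips' list, and redis_out keeps sampling until check() approves one. The addresses below are hypothetical placeholders; with dead proxies the loop simply keeps retrying.

# usage sketch for redis_con.py; the addresses are hypothetical placeholders
from redis_con import redis_save, redis_out

redis_save(['1.2.3.4:8080', '5.6.7.8:3128'])  # push proxies onto the 'ips' list
proxy = redis_out()   # keeps sampling until a reachable proxy is found, e.g. "http://1.2.3.4:8080"
print(proxy)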
Check whether a proxy IP is reachable.
import requests


def check(ip):
    # Values read back from Redis are bytes; str(ip)[2:-1] strips the b'...' wrapper
    ip = "http://" + str(ip)[2:-1]
    print(ip)
    try:
        # Probe request through the proxy; the test URL is left blank here
        results = requests.get('', proxies={"http": ip}, timeout=5)
    except Exception:
        print('failed')
        return False
    else:
        print('ok')
        return ip
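For reference, a sketch of how check() is expected to be called: it receives a bytes value exactly as returned by redis_client.lrange, which is why the b'' wrapper is stripped. The address is a hypothetical placeholder, and the call will only report 'failed' until a real probe URL is filled in above.

# usage sketch for ip_check.py
from ip_check import check

result = check(b'1.2.3.4:8080')   # bytes, as stored in Redis
if result:
    print('usable proxy:', result)   # e.g. "http://1.2.3.4:8080"
else:
    print('proxy unreachable or probe URL not set')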
Set the proxy IP in the downloader middleware.
from redis_con import redis_out


class IPTest(MyFirstSpiderDownloaderMiddleware):
    def process_request(self, request, spider):
        # Attach a dynamic proxy from the Redis pool to every outgoing request
        request.meta['proxy'] = redis_out()
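The middleware only takes effect once it is enabled in the project's settings.py; a sketch of the usual entry, assuming the project is called my_first_spider and IPTest lives in its middlewares module (both names and the priority value are assumptions):

# settings.py - sketch; module path and priority value are assumptions
DOWNLOADER_MIDDLEWARES = {
    'my_first_spider.middlewares.IPTest': 543,
}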