# python_day06 (ip代理池 / IP proxy pool)

# 时间 (time): 2020-12-24 17:22:28
from urllib.request import Request, ProxyHandler
from urllib.request import build_opener
from bs4 import BeautifulSoup
import MySQLdb;
import redis
from urllib.request import urlopen
from lxml import etree
from lxml import etree
import re;
# Site root; prepended to the relative "next page" links found while crawling.
urlfront = "http://www.xicidaili.com"
# First listing page handed to the crawl loop.
url = "http://www.xicidaili.com/nn/1"
# Shared Redis client used by the functions below to store proxy addresses.
# NOTE(review): the port/db literals were missing from the source; 6379 and 0
# are the Redis defaults — confirm they match the intended deployment.
result = redis.Redis(host='127.0.0.1', port=6379, db=0)
def get_allcode(url):
    """Fetch a whole page and return its body decoded as UTF-8 text.

    The request is sent through an opener built with a ``ProxyHandler`` and a
    browser-like ``User-agent`` header.

    Args:
        url: Address of the page to download.

    Returns:
        The response body as a ``str``.

    Raises:
        UnicodeDecodeError: If the body is not valid UTF-8.
        Exception: Any error raised by ``opener.open`` propagates unchanged.
    """
    # NOTE(review): the proxy is registered for the "https" scheme only, so it
    # presumably is not applied to plain "http://" URLs — confirm the intent.
    proxy = {'https': '110.73.0.45:8123'}
    opener = build_opener(ProxyHandler(proxy))
    # Send a browser-like User-agent header with the request.
    opener.addheaders = [
        ('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')]
    # The context manager closes the response even if read/decode fails.
    with opener.open(url) as r:
        return r.read().decode("UTF-8")
def find_ip(s):
    """Extract proxy addresses from listing HTML with lxml and store them.

    The text of every ``<td>`` inside ``<tr>`` rows whose class is ``"odd"``
    or ``""`` is classified: text containing ``-`` is ignored, all-digit text
    is collected as a port, and text containing ``.`` is collected as an IP.
    IPs and ports are then paired by position and pushed as ``"ip:port"``
    onto the Redis list ``'mylist'`` of the module-level ``result`` client.

    Args:
        s: HTML source of one listing page.
    """
    selector = etree.HTML(s)
    cells = selector.xpath('//tr[@class="odd"]/td/text()|//tr[@class=""]/td/text()')
    ips = []
    ports = []
    for cell in cells:
        if '-' in cell:
            # Cells containing '-' are neither an IP nor a port.
            continue
        if cell.isdigit():
            ports.append(cell)
        elif '.' in cell:
            ips.append(cell)
    # zip stops at the shorter list, so a page yielding fewer ports than IPs
    # no longer raises IndexError part-way through the push loop.
    for host, port in zip(ips, ports):
        result.lpush('mylist', host + ":" + port)
def get_next_page(s):
    """Return the absolute URL of the next listing page, or ``None``.

    Looks up the ``href`` of ``<a class="next_page">`` inside
    ``<div class="pagination">`` and prefixes the first match with the
    module-level ``urlfront``.

    Args:
        s: HTML source of the current listing page.

    Returns:
        ``urlfront`` joined with the first matching href, or ``None`` when
        the page has no such link.
    """
    selector = etree.HTML(s)
    hrefs = selector.xpath('//div[@class="pagination"]/a[@class="next_page"]/@href')
    # Attribute XPath results are strings, so the old per-item None check could
    # never fire; an empty result list is the real "no next page" case.
    if not hrefs:
        return None
    return urlfront + hrefs[0]
def get_allcode_ip(url, ip, timeout=5):
    """Check whether proxy *ip* can fetch *url*; record it in Redis if so.

    The page is requested through *ip* as an HTTP proxy. When the body is
    downloaded and decodes as UTF-8, the address is pushed onto the Redis
    list ``'usable_ip'`` of the module-level ``result`` client and printed
    between two marker lines. Any failure is printed and otherwise ignored.

    Args:
        url: Page to request through the proxy.
        ip: Proxy address as ``"host:port"``, either ``bytes`` or ``str``.
        timeout: Seconds to wait for the request.
            NOTE(review): the original literal was missing from the source;
            5 is a placeholder — confirm the intended value.
    """
    try:
        # Accept str as well as bytes; the old str(ip, encoding=...) call
        # raised TypeError for input that was already a str.
        if isinstance(ip, bytes):
            ip = ip.decode("utf-8")
        proxy = {'http': ip}
        opener = build_opener(ProxyHandler(proxy))
        # Send a browser-like User-agent header with the request.
        opener.addheaders = [
            ('User-agent', 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6')]
        # Bounded by the timeout; the context manager closes the response.
        with opener.open(url, None, timeout) as r:
            # Reading and decoding the full body is part of the check.
            r.read().decode("UTF-8")
        print('+++++++++++++++')
        # Store the working proxy in the Redis 'usable_ip' list.
        result.lpush('usable_ip', ip)
        print(ip)
        print('+++++++++++++++')
    except Exception as err:
        # Deliberately broad: any failure just means this proxy is skipped.
        print(err)
# Phase 1: walk the listing pages, storing every proxy address found.
# A separate cursor keeps ``url`` intact; the old loop overwrote it with None,
# so phase 2 then tried to open None for every proxy.
page_url = url
while page_url is not None:
    print(page_url)
    s = get_allcode(page_url)
    # Parse before moving on; previously the last page was skipped because
    # the loop broke out before find_ip() was reached.
    find_ip(s)
    page_url = get_next_page(s)
    print(page_url)

# Phase 2: pop each stored address and test it against the start page.
while True:
    ip = result.lpop('mylist')
    print(ip)
    if ip is None:
        break
    get_allcode_ip(url, ip)