一、简介
- 爬虫中为什么需要使用代理
一些网站会有相应的反爬虫措施,例如很多网站会检测某一段时间某个IP的访问次数,如果访问频率太快以至于看起来不像正常访客,它可能就会禁止这个IP的访问。所以我们需要设置一些代理IP,每隔一段时间换一个代理IP,就算IP被禁止,依然可以换个IP继续爬取。
- 代理的分类:
正向代理:代理客户端获取数据。正向代理是为了保护客户端防止被追究责任。
反向代理:代理服务器提供数据。反向代理是为了保护服务器或负责负载均衡。
- 免费代理ip提供网站
http://www.goubanjia.com/
西刺代理
快代理
- 匿名度:
- 透明:知道是代理ip,也会知道你的真实ip
- 匿名:知道是代理ip,不会知道你的真实ip
- 高匿:不知道是代理ip,不会知道你的真实ip
- 类型:
- http:只能请求http开头的url
- https:只能请求https开头的url
示例
import random

import requests

# Browser-like User-Agent sent with the request.
headers = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.103 Safari/537.36'
}
url = 'https://www.baidu.com/s?wd=ip'

# Candidate proxies. requests selects the proxy by the scheme of the target
# URL, so every candidate carries an "https" entry as well; with only an
# "http" key the https:// URL above would be fetched without any proxy.
# Both entries use an http:// proxy URL because the client talks plain HTTP
# to the proxy itself.
proxy_list = [
    {'http': 'http://112.115.57.20:3128', 'https': 'http://112.115.57.20:3128'},
    {'http': 'http://121.41.171.223:3128', 'https': 'http://121.41.171.223:3128'},
]

# Pick one proxy at random for this request.
proxy = random.choice(proxy_list)

# A timeout keeps the script from hanging forever on a dead proxy.
page_text = requests.get(url=url, headers=headers, proxies=proxy, timeout=10).text

with open('ip.html', 'w', encoding='utf-8') as fp:
    fp.write(page_text)
print('over!')
二、IP池
1、免费IP池
从西刺代理上面爬取IP,迭代测试能否使用,建立一个自己的代理IP池,随时更新用来抓取网站数据
import random
import time

import requests
from fake_useragent import UserAgent
from lxml import etree


class GetProxyIP(object):
    """Scrape proxy candidates from xicidaili and save the ones that work.

    Working proxies are appended to ``proxies.txt`` as ``ip:port`` lines.
    """

    def __init__(self):
        # "{}" is replaced with the page number in main(); without the
        # placeholder every iteration would fetch the same first page.
        self.url = 'https://www.xicidaili.com/nn/{}'
        # Bootstrap proxy used to fetch the listing pages themselves.
        self.proxies = {
            'http': 'http://163.204.247.219:9999',
            'https': 'http://163.204.247.219:9999',
        }

    def get_random_ua(self):
        """Return a random User-Agent string."""
        ua = UserAgent()
        return ua.random

    def get_ip_file(self, url):
        """Fetch one listing page and test every ip/port row found on it.

        A page that cannot be fetched or parsed is reported and skipped so
        that a single failure does not abort the whole crawl.
        """
        headers = {'User-Agent': self.get_random_ua()}
        try:
            response = requests.get(url=url, proxies=self.proxies, headers=headers, timeout=5)
        except requests.RequestException as err:
            print(url, 'Failed to fetch page:', err)
            return

        html = response.content.decode('utf-8', 'ignore')
        try:
            parse_html = etree.HTML(html)
        except etree.LxmlError as err:
            print(url, 'Failed to parse page:', err)
            return
        if parse_html is None:
            # Nothing parseable came back (e.g. an empty response body).
            return

        # Base xpath: one <tr> per proxy; the first row is the table header.
        tr_list = parse_html.xpath('//tr')
        for tr in tr_list[1:]:
            ip_cells = tr.xpath('./td[2]/text()')
            port_cells = tr.xpath('./td[3]/text()')
            if not ip_cells or not port_cells:
                # Row without the expected cells; indexing [0] would raise.
                continue
            self.test_proxy_ip(ip_cells[0].strip(), port_cells[0].strip())

    def test_proxy_ip(self, ip, port):
        """Request a test page through ip:port and record the proxy on HTTP 200."""
        # Same http:// proxy URL for both schemes, matching self.proxies.
        proxy_url = 'http://{}:{}'.format(ip, port)
        proxies = {'http': proxy_url, 'https': proxy_url}
        test_url = 'http://www.baidu.com/'
        try:
            res = requests.get(url=test_url, proxies=proxies, timeout=8)
        except Exception:
            # Deliberately broad: a bad proxy can fail in many ways and must
            # never stop the crawl.
            print(ip, port, 'Failed')
            return
        if res.status_code == 200:
            print(ip, ":", port, 'Success')
            with open('proxies.txt', 'a') as f:
                f.write(ip + ':' + port + '\n')

    def main(self):
        """Walk listing pages 1..1000, pausing 5-10 seconds between pages."""
        for i in range(1, 1001):
            url = self.url.format(i)
            self.get_ip_file(url)
            time.sleep(random.randint(5, 10))


if __name__ == '__main__':
    spider = GetProxyIP()
    spider.main()
从IP池中取IP,也就是在爬虫程序中从文件随机获取代理IP
import random

import requests


class BaiduSpider(object):
    """Fetch a page through a proxy picked at random from ``proxies.txt``."""

    def __init__(self):
        self.url = 'http://www.baidu.com/'
        self.headers = {'User-Agent': 'Mozilla/5.0'}
        # Attempt counter; get_html() gives up once it exceeds 3.
        self.flag = 1

    def get_proxies(self):
        """Return a requests-style proxies dict for one randomly chosen proxy.

        Raises:
            IndexError: if ``proxies.txt`` contains no usable line.
        """
        with open('proxies.txt', 'r') as f:
            # strip() instead of [:-1]: the last line may lack a trailing
            # newline, in which case [:-1] would cut off a port digit.
            candidates = [line.strip() for line in f if line.strip()]
        # Pick ONE proxy at random from the pool.
        parts = random.choice(candidates).split(':')
        # Same http:// proxy URL for both target schemes.
        proxy_url = 'http://{}:{}'.format(parts[0], parts[1])
        return {'http': proxy_url, 'https': proxy_url}

    def get_html(self):
        """Fetch self.url and print it, retrying with a new proxy up to 3 attempts."""
        while self.flag <= 3:
            proxies = self.get_proxies()
            try:
                html = requests.get(url=self.url, proxies=proxies, headers=self.headers, timeout=5).text
                print(html)
                return
            except Exception:
                # Deliberately broad: any failure just means "try another proxy".
                print('Retry')
                self.flag += 1


if __name__ == '__main__':
    spider = BaiduSpider()
    spider.get_html()
2、收费代理API
写一个获取收费开放API代理的接口
import requests
from fake_useragent import UserAgent

ua = UserAgent()  # User-Agent generator
useragent = ua.random
headers = {'User-Agent': useragent}


def ip_test(ip):
    """Return True if a test page can be fetched with HTTP 200 through ``ip``.

    Args:
        ip: proxy address as an ``"ip:port"`` string.

    Returns:
        True on HTTP 200; False for a malformed entry, a failed request or
        any other status code.
    """
    url = 'http://www.baidu.com/'
    ip_port = ip.split(':')
    if len(ip_port) < 2:
        # Not an "ip:port" entry (e.g. an empty line from the API response).
        return False
    # Same http:// proxy URL for both target schemes.
    proxy_url = 'http://{}:{}'.format(ip_port[0], ip_port[1])
    proxies = {'http': proxy_url, 'https': proxy_url}
    try:
        res = requests.get(url=url, headers=headers, proxies=proxies, timeout=5)
    except requests.RequestException:
        # A dead or slow proxy is the expected failure; it must not abort the run.
        return False
    return res.status_code == 200


def get_ip_list():
    """Download a batch of proxies from the API and append the working ones to ``proxy_ip.txt``."""
    # Kuaidaili: https://www.kuaidaili.com/doc/product/dps/
    # NOTE(review): the order id is embedded in the URL; consider loading it from configuration.
    api_url = 'http://dev.kdlapi.com/api/getproxy/?orderid=946562662041898&num=100&protocol=1&method=2&an_an=1&an_ha=1&sep=2'
    html = requests.get(api_url, timeout=10).content.decode('utf-8', 'ignore')
    # splitlines() + strip() drops blank entries and stray "\r" characters.
    ip_port_list = [line.strip() for line in html.splitlines() if line.strip()]
    # Open the output file once instead of once per candidate.
    with open('proxy_ip.txt', 'a') as f:
        for ip in ip_port_list:
            if ip_test(ip):
                f.write(ip + '\n')


if __name__ == '__main__':
    get_ip_list()
3、私密代理
1、语法结构
用户名和密码由代理服务商在提供API_URL时一并给出,是代理专用的认证信息,而不是你在代理网站上的登录账号和密码。
# General pattern: the key is the scheme of the TARGET url, the value is the
# proxy URL with the credentials embedded.
proxies = {
    '协议': '协议://用户名:密码@IP:端口号',
}

# One entry per target scheme. The proxy URL itself uses http:// for both
# entries: the client talks plain HTTP to the proxy, and an https:// proxy
# URL would make it attempt TLS with the proxy itself.
proxies = {
    'http': 'http://用户名:密码@IP:端口号',
    'https': 'http://用户名:密码@IP:端口号',
}

# Concrete example (sample credentials).
proxies = {
    'http': 'http://309435365:szayclhp@106.75.71.140:16816',
    'https': 'http://309435365:szayclhp@106.75.71.140:16816',
}
# Fetch private (authenticated) proxies from the provider API and keep the working ones.
import requests
from fake_useragent import UserAgent

ua = UserAgent()  # User-Agent generator
useragent = ua.random
headers = {'User-Agent': useragent}


def ip_test(ip):
    """Return True if a test page can be fetched with HTTP 200 through ``ip``.

    Args:
        ip: proxy address as an ``"ip:port"`` string.

    Returns:
        True on HTTP 200; False for a malformed entry, a failed request or
        any other status code.
    """
    url = 'https://blog.csdn.net/qq_34218078/article/details/90901602/'
    ip_port = ip.split(':')
    if len(ip_port) < 2:
        # Not an "ip:port" entry (e.g. an empty line from the API response).
        return False
    # Proxy credentials are embedded in the proxy URL; both target schemes
    # go through the same http:// proxy URL.
    proxy_url = 'http://1786088386:b95djiha@{}:{}'.format(ip_port[0], ip_port[1])
    proxies = {'http': proxy_url, 'https': proxy_url}
    try:
        res = requests.get(url=url, headers=headers, proxies=proxies, timeout=5)
    except requests.RequestException as err:
        # A dead or slow proxy is the expected failure; it must not abort the run.
        print(err)
        print("错误")
        return False
    if res.status_code == 200:
        print("OK")
        return True
    print(res.status_code)
    print("错误")
    return False


def get_ip_list():
    """Download a batch of proxies from the API and append the working ones to ``proxy_ip.txt``."""
    # Kuaidaili: https://www.kuaidaili.com/doc/product/dps/
    # NOTE(review): order id and signature are embedded in the URL; consider loading them from configuration.
    api_url = 'http://dps.kdlapi.com/api/getdps/?orderid=986603271748760&num=1000&signature=z4a5b2rpt062iejd6h7wvox16si0f7ct&pt=1&sep=2'
    html = requests.get(api_url, timeout=10).content.decode('utf-8', 'ignore')
    # splitlines() + strip() drops blank entries and stray "\r" characters.
    ip_port_list = [line.strip() for line in html.splitlines() if line.strip()]
    # Open the output file once instead of once per candidate.
    with open('proxy_ip.txt', 'a') as f:
        for ip in ip_port_list:
            if ip_test(ip):
                f.write(ip + '\n')


if __name__ == '__main__':
    get_ip_list()
思路:
- 写一个类;
- get_ip() requests请求接口,得到ip和port;
- test_ip() 请求某一网站,根据状态码,或用in判断响应中是否包含某一内容,来判断此ip是否可用,返回True或False即可;
- save_ip()测试成功后保存;