The fastest way to get a scraper past blocking is to use proxies. Without further ado, here's the code:
import requests
import parsel
import json
def getHTMLText(url):
    try:
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) "
                          "Chrome/86.0.4240.198 Safari/537.36"
        }
        r = requests.get(url=url, headers=headers, timeout=30)
        r.raise_for_status()  # raise HTTPError if the status code is not 200
        r.encoding = r.apparent_encoding
        return r.text
    except requests.RequestException:
        return "request failed"
def data(page_text):
    proxies_list_all = []
    proxies_list = []
    selector = parsel.Selector(page_text)
    trs = selector.css('#list > table > tbody > tr')  # CSS-select every row of the proxy table
    for tr in trs:
        ip_num = tr.css('td:nth-child(1)::text').get()   # IP address (1st column)
        ip_port = tr.css('td:nth-child(2)::text').get()  # port (2nd column)
        if not ip_num or not ip_port:  # skip rows the selectors failed to parse
            continue
        proxies_dict = {
            "https": "https://" + ip_num + ':' + ip_port,
        }
        proxies_list_all.append(proxies_dict)
        # check whether the proxy actually works
        try:
            response_1 = requests.get(url='https://www.baidu.com/', proxies=proxies_dict, timeout=5)
            if response_1.status_code == 200:
                print('proxy works:', proxies_dict)
                proxies_list.append(proxies_dict)
                # save the working proxy, one JSON object per line
                with open("代理.txt", mode='a', encoding='utf-8') as f:
                    f.write(json.dumps(proxies_dict))
                    f.write('\n')
        except requests.RequestException:
            print("proxy unusable:", proxies_dict)
    # print("total proxies scraped:", len(proxies_list_all))
    # print("usable proxies:", len(proxies_list))
if __name__ == '__main__':
    for page in range(11, 22):  # pages 11-21 of the free proxy list
        url = "https://www.kuaidaili.com/free/inha/" + str(page)
        txt = getHTMLText(url)
        data(txt)
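Note that proxies_dict only carries an "https" entry, so requests will route only HTTPS URLs through the proxy; if you also scrape plain-HTTP sites, add a matching "http" key pointing at the same host and port.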
The usable proxies are saved in the file 代理.txt, one JSON object per line.
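To put the harvested proxies to work, read 代理.txt back and pick one at random per request. Below is a minimal sketch, assuming the one-JSON-object-per-line format written above; the target URL https://httpbin.org/ip is just a placeholder for testing and not part of the original script:

import json
import random
import requests

def load_proxies(path="代理.txt"):
    # each line looks like {"https": "https://1.2.3.4:8080"}
    with open(path, encoding='utf-8') as f:
        return [json.loads(line) for line in f if line.strip()]

proxies_pool = load_proxies()
proxy = random.choice(proxies_pool)  # rotate by picking a random proxy each time
try:
    resp = requests.get('https://httpbin.org/ip', proxies=proxy, timeout=5)
    print(resp.text)  # should echo the proxy's IP, not yours
except requests.RequestException:
    print('proxy failed, try another one:', proxy)

Free proxies tend to die quickly, so re-run the harvester and re-check each proxy shortly before you actually use it.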