最近运行使用时间2017.12.01
运行结果正常
运行环境:Python 2.7
# -*- coding: utf-8 -*-
# Scrape proxy listings from xicidaili.com into ip_port.txt, then keep the
# entries that accept a TCP connection in verified.txt.
# Targets Python 2.7 (urllib2 / telnetlib).
import contextlib
import io
import sys  # noqa: F401 -- retained from the original script
import telnetlib
import urllib2

from bs4 import BeautifulSoup


def _format_row(tds):
    """Build one '|'-separated unicode record from the <td> cells of a row."""
    flag = tds[0].find('img')
    if flag is None:
        nation = u'未知'
        locate = u'未知'
    else:
        nation = flag['alt'].strip()
        # NOTE(review): this reads the same cell as `anony` below; presumably
        # another column was intended -- confirm against the page layout.
        locate = tds[4].text.strip()
    ip = tds[1].text.strip()
    port = tds[2].text.strip()
    anony = tds[4].text.strip()
    protocol = tds[5].text.strip()
    speed = tds[8].text.strip()
    checked = tds[9].text.strip()
    return u'%s|%s|%s|%s|%s|%s|%s|%s\n' % (
        nation, ip, port, locate, anony, protocol, speed, checked)


def getProxyList(targeturl="http://www.xicidaili.com/nn/", pages=5):
    """Scrape proxy rows from the listing pages and append them to ip_port.txt.

    Args:
        targeturl: Base URL of the listing; the 1-based page number is
            appended to it to form each page URL.
        pages: Number of listing pages to fetch, starting at page 1. The
            original comment promised five pages while ``range(1, 5)``
            fetched only four; the default now matches the comment.

    Returns:
        The number of proxy records written.

    Raises:
        urllib2.URLError: If a page cannot be fetched.
        AttributeError: If a page has no ``<table id="ip_list">``.
    """
    request_headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0",
    }
    written = 0
    # Explicit UTF-8 text mode: the records mix non-ASCII text, which the
    # implicit ASCII codec of a plain Python 2 file object cannot encode.
    with io.open('ip_port.txt', 'a', encoding='utf-8') as proxy_file:
        for page in range(1, pages + 1):
            url = targeturl + str(page)
            print(url)
            request = urllib2.Request(url, headers=request_headers)
            with contextlib.closing(urllib2.urlopen(request)) as response:
                html_doc = response.read()
            soup = BeautifulSoup(html_doc, "html.parser")
            rows = soup.find('table', id='ip_list').find_all('tr')
            # The first <tr> is skipped (presumably the header row).
            for row in rows[1:]:
                proxy_file.write(_format_row(row.find_all('td')))
                written += 1
    return written


def verifyProxyList():
    """Copy the reachable proxies from ip_port.txt to verified.txt.

    Each non-blank line of ip_port.txt is split on '|'; fields 1 and 2 are
    used as host and port. A proxy is kept when a TCP connection to it opens
    within 2 seconds. Lines are copied through unchanged (as bytes).
    """
    with open('ip_port.txt', 'rb') as in_file, \
            open('verified.txt', 'w') as out_file:
        for raw in in_file:
            record = raw.strip()
            if not record:
                # A blank line is not end-of-file; keep scanning.
                continue
            fields = record.split('|')
            if len(fields) < 3:
                print('malformed line skipped: ' + record)
                continue
            ip, port = fields[1], fields[2]
            try:
                connection = telnetlib.Telnet(ip, port, timeout=2)
            except Exception:
                # Best-effort probe: any connection problem means "unusable".
                # KeyboardInterrupt/SystemExit still propagate so the run can
                # be aborted.
                print('connect failed')
                continue
            connection.close()
            out_file.write(record + "\n")
            print('success' + ip + port)


if __name__ == '__main__':
    proxynum = getProxyList("http://www.xicidaili.com/nn/")
    print(u"国内高匿:" + str(proxynum))
    # Other listings on the same site that can be scraped the same way:
    #   http://www.xicidaili.com/nt/  (domestic, transparent)
    #   http://www.xicidaili.com/wn/  (foreign, high anonymity)
    #   http://www.xicidaili.com/wt/  (foreign, transparent)
    verifyProxyList()
    print("降龙十八掌,打完收工")