I'm going to gradually post the small crawlers I wrote a while back; sorry, I haven't really cleaned them up yet~
The proxies that pass the check are saved to ip_use.txt in the working directory.
#encoding:utf8
#author : buracag_mc
import urllib2, urllib
import BeautifulSoup  # BeautifulSoup 3; with bs4 this would be "from bs4 import BeautifulSoup"
import re

# Spoof a desktop browser so xicidaili does not reject the request
User_Agent = 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:43.0) Gecko/20100101 Firefox/43.0'
header = {'User-Agent': User_Agent}

# Fetch the first page of high-anonymity proxies
url = 'http://www.xicidaili.com/nn/1'
req = urllib2.Request(url, headers=header)
res = urllib2.urlopen(req).read()

# Each proxy sits in a <tr>; columns 1 and 2 hold the IP and the port
soup = BeautifulSoup.BeautifulSoup(res)
ips = soup.findAll('tr')
f = open('proxy.txt', 'w')
for x in range(1, len(ips)):  # skip the table header row
    tds = ips[x].findAll('td')
    ip_temp = tds[1].contents[0] + ':' + tds[2].contents[0] + '\n'
    f.write(ip_temp.encode('utf8'))
f.close()

# Turn each "ip:port" line into a proxies dict that urllib understands
testf = open('proxy.txt', 'r')
lines = testf.readlines()
testf.close()
proxys = []
for line in lines:
    proxy_host = 'http://' + line.strip('\n')
    proxys.append({'http': proxy_host})

# Probe each proxy against chinaz's IP echo service; a response starting
# with "{ip:" means the request was relayed successfully
url = 'http://ip.chinaz.com/getip.aspx'
usefulip = []
i = 0
for proxy in proxys:
    try:
        res = urllib.urlopen(url, proxies=proxy).read()
        print res
        if re.match(r'^\{ip:', res):
            usefulip.append(proxy)
            print 'this ip is ok! get!'
    except Exception, e:
        print proxy
        print e
        continue
    i += 1
    if i > 5:  # only probe the first few proxies
        break

print usefulip
f2 = open('ip_use.txt', 'ab+')
for ip in usefulip:
    f2.write(ip['http'] + '\n')
f2.close()
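The script above only harvests and checks the proxies. As a quick follow-up, here is a minimal sketch of how one of the saved addresses could actually be used for a request; the ip_use.txt name and one-address-per-line format come from the script, while the test URL and timeout are just illustrative assumptions.

# -*- coding: utf-8 -*-
# Sketch (not part of the original script): route a request through
# the first proxy saved in ip_use.txt.
import urllib2

with open('ip_use.txt') as f:
    # each line looks like "http://1.2.3.4:8080"
    proxy_url = f.readline().strip()

handler = urllib2.ProxyHandler({'http': proxy_url})
opener = urllib2.build_opener(handler)
# the echo URL and 10s timeout are assumptions, swap in any target
print opener.open('http://ip.chinaz.com/getip.aspx', timeout=10).read()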
To wrap up, here are some other related tutorials: