1、检测表格中的网站是否能v6访问,是否有AAAA记录
# coding:utf8
"""Check whether each website listed in webs.xls is reachable over IPv6
and whether it has an AAAA record; results are written back into the
same spreadsheet (column 3: AAAA address, column 4: HTTP status code)."""
import socket
from io import BytesIO

import pycurl
import threadpool  # NOTE(review): unused here, kept to preserve the file's imports
import threading
import xlrd
import xlwt  # NOTE(review): unused here, kept to preserve the file's imports
from xlutils.copy import copy

EXCEL_PATH = r'webs.xls'

look = threading.Lock()  # NOTE(review): never used (probable typo for "lock"); kept for compatibility


def web_can_ipv6(url, timeout=30):
    """Fetch http://<url> over IPv6 only and return the HTTP status code.

    Returns None when the request fails for any reason (no AAAA record,
    connection refused, timeout, ...).  `timeout` (seconds) bounds the
    transfer so a dead host cannot hang the scan.
    """
    buffer = BytesIO()
    c = pycurl.Curl()
    try:
        c.setopt(pycurl.OPT_CERTINFO, 1)
        c.setopt(pycurl.URL, "http://" + url)
        # Force curl to resolve the host to IPv6 addresses only.
        c.setopt(pycurl.IPRESOLVE, pycurl.IPRESOLVE_V6)
        c.setopt(pycurl.TIMEOUT, timeout)
        c.setopt(c.WRITEDATA, buffer)
        c.perform()
        return c.getinfo(pycurl.HTTP_CODE)
    except pycurl.error:
        return None
    finally:
        # BUG FIX: the original leaked the curl handle on every call.
        c.close()


def read_excel():
    """Return all data rows of Sheet1 in webs.xls (header row skipped)."""
    workbook = xlrd.open_workbook(EXCEL_PATH)
    sheet1 = workbook.sheet_by_name('Sheet1')
    return [sheet1.row_values(row_n) for row_n in range(1, sheet1.nrows)]


def write_excel(row, col, text):
    """Write `text` into cell (row, col) of sheet 0 and save the workbook.

    NOTE: the whole file is re-copied and re-saved on every call -- slow,
    but it keeps each write self-contained.
    """
    # BUG FIX: open_workbook's second positional argument is `logfile`,
    # not a file mode; the original passed 'wb' there by mistake.
    rb = xlrd.open_workbook(EXCEL_PATH)
    wb = copy(rb)  # xlrd workbooks are read-only; copy to a writable book
    ws = wb.get_sheet(0)
    ws.write(row, col, text)
    wb.save(EXCEL_PATH)


def get_dns(url):
    """Return the first IPv6 (AAAA) address of `url`, or None when the
    lookup fails or the host has no IPv6 address."""
    try:
        results = socket.getaddrinfo(url, None)
    except socket.gaierror:
        print('%s-error' % url)
        return None
    for item in results:
        # An IPv6 sockaddr always contains ':'; IPv4 dotted quads never do.
        if ':' in item[4][0]:
            return item[4][0]
    return None


def main(web):
    """Process one spreadsheet row: record the AAAA address (col 3) and,
    when the site answers over IPv6, the HTTP status code (col 4).

    web: a row from read_excel(); web[0] is the row index, web[2] the host.
    """
    print('正在检测第%s个网站' % web[0])
    write_excel(web[0], 3, get_dns(web[2]))
    status = web_can_ipv6(web[2])  # call once instead of twice as before
    if status:
        write_excel(web[0], 4, status)


if __name__ == '__main__':
    for web in read_excel():
        main(web)
2、检测表格中的网站的返回码,归属运营商,多线程
总共一万多条数据的表格,采用分段处理的方式,大大提高处理速度
# coding:utf8
"""Check the HTTP status code and owning carrier of every website in
webs.xls, multi-threaded.  The ~10k rows are split into chunks of 100;
each chunk is processed by one thread per row and the workbook is saved
between chunks, which greatly speeds up the run."""
import socket
import threading

import requests
import threadpool  # NOTE(review): unused here, kept to preserve the file's imports
import xlrd
import xlwt  # NOTE(review): unused here, kept to preserve the file's imports
from IPy import IP
from contextlib import closing  # NOTE(review): unused, kept to preserve the file's imports
from concurrent.futures import ThreadPoolExecutor, wait  # NOTE(review): unused, kept
from xlutils.copy import copy

lock = threading.Lock()

EXCEL_PATH = r'webs.xls'
CHUNK_SIZE = 100


def get_web_code(ws, web):
    """HEAD-request the site of spreadsheet row `web`; write the status
    code to column 3 and the owning carrier to column 4 of sheet `ws`.

    web: a row from read_excel(); web[0] is the row number, web[2] the URL.
    Returns the HTTP status code, or None when the request failed (then
    'ERROR' is written to column 3 instead).
    """
    try:
        url = 'http://' + str(web[2]).strip()
        headers = {
            'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Trident/5.0) ',
        }
        r = requests.head(url=url, timeout=1, headers=headers)

        print('第%s行-->%s' % (web[0], str(r.status_code)))

        # Resolve the address ONCE; the original re-resolved it for every
        # carrier test (up to four DNS lookups per row).
        ip = get_dns(url)
        if is_url_in_cernet(ip):
            carrier = 'CERNET'
        elif is_url_in_cmcc(ip):
            carrier = 'CMCC'
        elif is_url_in_cucc(ip):
            carrier = 'CUCC'
        elif is_url_in_ctcc(ip):
            carrier = 'CTCC'
        else:
            carrier = 'UNKnow'  # original spelling kept: it is written to the sheet
        write_excel(ws, int(web[0]), 4, carrier)
        write_excel(ws, int(web[0]), 3, r.status_code)
        return r.status_code
    except Exception as e:
        write_excel(ws, int(web[0]), 3, 'ERROR')
        print('第%s行-->%s' % (web[0], e))


def read_excel():
    """Read all data rows of Sheet1 (header skipped) and return them
    grouped into chunks of CHUNK_SIZE rows, so the caller can save the
    workbook after every chunk."""
    workbook = xlrd.open_workbook(EXCEL_PATH)
    sheet1 = workbook.sheet_by_name('Sheet1')
    web_list = [sheet1.row_values(row_n) for row_n in range(1, sheet1.nrows)]
    n = CHUNK_SIZE
    return [web_list[i:i + n] for i in range(0, len(web_list), n)]


def write_excel(ws, row, col, text):
    """Thread-safely write `text` into cell (row, col) of worksheet `ws`."""
    with lock:
        try:
            ws.write(row, col, text)
        except Exception:
            pass  # best effort: one bad cell must not kill the worker thread


def get_dns(url):
    """Return the A-record address of `url` (scheme and port stripped),
    or the cleaned host string itself when resolution fails."""
    with lock:
        url = url.strip().lower()
        # BUG FIX: the original tested `'http://' or 'https://' in url`,
        # which is always True; strip both schemes unconditionally.
        url = url.replace('http://', '').replace('https://', '')
        if ':' in url:
            url = url.split(':')[0]  # drop an explicit port
        try:
            return socket.getaddrinfo(url, None)[0][4][0]
        except (socket.gaierror, UnicodeError):
            return url


def _ip_in_range_file(ip, filename):
    """Return True when `ip` falls inside any IP range listed in
    `filename` (one range per line; stray ';' markers and ':' suffixes
    are stripped).  Returns None on any error (missing file, bad line) --
    same truthiness as the original copy-pasted checkers."""
    with lock:
        try:
            with open(filename, 'r') as rf:
                for line in rf:
                    line = line.replace(';', '')
                    if ':' in line:
                        line = line.split(':')[0]
                    if ip in IP(line):
                        return True
        except Exception:
            pass  # treat unreadable data as "not in this carrier"


def is_url_in_cernet(url):
    """True when address `url` belongs to CERNET's ranges."""
    return _ip_in_range_file(url, 'cernet_ip.txt')


def is_url_in_cmcc(url):
    """True when address `url` belongs to China Mobile's ranges."""
    return _ip_in_range_file(url, 'CMCC_ip.txt')


def is_url_in_cucc(url):
    """True when address `url` belongs to China Unicom's ranges."""
    return _ip_in_range_file(url, 'CUCC_ip.txt')


def is_url_in_ctcc(url):
    """True when address `url` belongs to China Telecom's ranges."""
    return _ip_in_range_file(url, 'CTCC_ip.txt')


if __name__ == '__main__':
    for web_list in read_excel():
        # BUG FIX: open_workbook's second positional argument is `logfile`,
        # not a file mode; the original passed 'wb' there by mistake.
        rb = xlrd.open_workbook(EXCEL_PATH)
        wb = copy(rb)  # xlrd books are read-only; copy to a writable one
        ws = wb.get_sheet(0)
        t_list = []
        for web in web_list:
            worker = threading.Thread(target=get_web_code, args=(ws, web))
            worker.start()
            t_list.append(worker)
        for t in t_list:
            t.join()  # wait for the whole chunk before saving
        wb.save(EXCEL_PATH)
        print('-----end-----')
3、将n个表格中的m个子表的数据全部汇总到一个总表中
测试是将30+个表格汇总,耗时不到2分钟
# -*- coding: utf-8 -*-
"""Merge the data of m sub-sheets from n .xls files in the current
directory tree into the matching sheets of total.xls."""
import os
import re  # NOTE(review): no longer used after the extension fix, kept to preserve imports
import time

import xlrd
from xlutils.copy import copy

TOTAL_FILE = r'total.xls'


def get_file():
    """Return the .xls files under the current directory that should be
    merged, excluding this script's own artifacts, total.xls itself and
    Excel's ~$ lock file."""
    file_list = []
    for root, dirs, files in os.walk(os.getcwd()):
        for file in files:
            if 'excel_run.' in file or file in ('total.xls', '~$total.xls'):
                continue
            # BUG FIX: the original re.search(r'/*.xls/*', file) matched any
            # name merely containing "xls" ('/*' is zero-or-more slashes and
            # '.' any char); anchor on the actual extension instead.
            if file.endswith('.xls'):
                file_list.append(file)
    return file_list


def read_excel(filename, sheetindex):
    """Return the data rows of sheet `sheetindex` in `filename`, skipping
    the first two (header) rows."""
    workbook = xlrd.open_workbook(r'%s' % filename)
    sheet1 = workbook.sheet_by_index(sheetindex)
    return [sheet1.row_values(row_n) for row_n in range(2, sheet1.nrows)]


def write_excel(sheetindex, file, count):
    """Append all data rows of sheet `sheetindex` of `file` below the
    existing rows of the same sheet in total.xls, then save.

    count: write offset minus one -- callers pass -1 so the first copied
    row lands exactly on the first free row of the target sheet.
    """
    # BUG FIX: open_workbook's second positional argument is `logfile`,
    # not a file mode; the original passed 'wb' there by mistake.
    rb = xlrd.open_workbook(TOTAL_FILE)
    wb = copy(rb)  # xlrd books are read-only; copy to a writable one
    ws = wb.get_sheet(sheetindex)
    nrow = rb.sheet_by_index(sheetindex).nrows  # index of the first free row
    for row in read_excel(file, sheetindex):
        count += 1
        for index_col, col in enumerate(row):
            ws.write(nrow + count, index_col, col)
    wb.save(TOTAL_FILE)


if __name__ == '__main__':
    t1 = time.time()
    for file in get_file():
        for sheetindex in [1, 2, 4, 5, 6]:
            count = -1
            print('正在录入%s的第%s张表的数据' % (file, sheetindex))
            write_excel(sheetindex, file, count)

    t2 = time.time()
    print('共耗时%s秒' % (t2 - t1))
    print('---end---')