下载纯真ip数据,地址:http://www.cz88.net/,数据格式是这样的:
58.54.0.0 58.54.2.30 湖北省黄石市 电信
58.54.2.31 58.54.2.31 湖北省黄石市阳新县 君临网吧
58.54.2.32 58.54.2.153 湖北省黄石市 电信
58.54.2.154 58.54.2.154 湖北省黄石市阳新县 新起点网吧(文化宫)
58.54.2.155 58.54.6.90 湖北省黄石市 电信
数据由空格分开,各字段分别是start、end、location、net
可以看到除了net,location是有可能相同的,这个数据大概有40多万行,我们导入数据库的话再查询是很慢的,所以我们就把相同的进行合并,并且每个数据只用一个空格隔开,代码如下:
import urllib.parse
import urllib.request

# Module-level merge state.  START/END/LOCATION/NET describe the record that
# opened the range currently sitting at the end of LINES; LINES collects the
# merged output records (one space-separated string per range).
START = ''
END = ''
LOCATION = ''
NET = ''
LINES = []


def remove_invalid_space(line):
    """Split a raw text line on runs of whitespace and return the fields."""
    return line.split()


def format_one_line(line):
    """Normalize split fields to ``[start, end, location, net]``.

    Everything after the third field is concatenated (without separators)
    into the net description, so a description that was split into several
    tokens is kept whole regardless of how many tokens it has.  When there is
    no fourth field the last field is used as the net value.
    """
    net = ''.join(line[3:]) or line[-1]
    return [line[0], line[1], line[2], net]


def line_merge(line):
    """Merge one normalized record into LINES.

    If the record's location matches the location of the current range (see
    ``is_same_line``), the last entry of LINES is replaced by a range that
    keeps the stored start, location and net but ends at this record's end.
    Otherwise the record is appended as a new range and becomes the current
    one.
    """
    global START, END, LOCATION, NET
    start, end, location, net = line[0], line[1], line[2], line[3]
    if is_same_line(LOCATION, location):
        # Extend the current range: only the end address changes.
        over_write_array(' '.join([START, str(end), LOCATION, NET]))
    else:
        write_to_array(' '.join(str(field) for field in (start, end, location, net)))
        START = start
        END = end
        LOCATION = location
        NET = net


def write_to_array(line):
    """Append a finished record to the global LINES list."""
    LINES.append(line)


def over_write_array(line):
    """Replace the last record of the global LINES list with ``line``."""
    del LINES[-1]
    LINES.append(line)


def is_same_line(last_location, current_location):
    """Return True when two locations are equal, either literally or after
    both are truncated at their first '市'."""
    if last_location == current_location:
        return True
    return cut_the_location(last_location) == cut_the_location(current_location)


def cut_the_location(location):
    """Return ``location`` up to and including its first '市'; unchanged if
    it contains no '市'."""
    index_city = location.find('市')
    if index_city == -1:
        return location
    return location[:index_city + 1]


def write_to_tmp_file(lines):
    """Append every record in ``lines`` to 'ip_tmp.txt', one per line."""
    try:
        # 'a' mode: records are appended to whatever the file already holds.
        with open('ip_tmp.txt', 'a') as file:
            for line in lines:
                file.write(line + '\n')
    except FileNotFoundError:
        print('file not found')


def format_ip_file(path):
    """Read the raw IP file at ``path`` and merge its records into LINES.

    Blank lines are skipped; a missing file is reported with a message
    instead of an exception.
    """
    try:
        with open(path) as file:
            for raw_line in file:
                fields = remove_invalid_space(raw_line)
                if not fields:
                    # An empty or whitespace-only line has no fields to index.
                    continue
                line_merge(format_one_line(fields))
    except FileNotFoundError:
        print('file not found')


print('start format')
# Raw string: the backslashes in the Windows path are literal characters.
format_ip_file(r'D:\workspace\Python\ip\ip.txt')
print('end format')
print('start write')
write_to_tmp_file(LINES)
print('end write', end = '')

# A sample of the merged data format follows.
0.0.0.0 0.255.255.255 IANA保留地址 CZ88.NET 1.0.0.0 1.0.0.255 澳大利亚 CZ88.NET 1.0.1.0 1.0.3.255 福建省 电信 1.0.4.0 1.0.7.255 澳大利亚 CZ88.NET 1.0.8.0 1.0.15.255 广东省 电信 1.0.16.0 1.0.31.255 日本 Beacon服务器 1.0.32.0 1.0.63.255 广东省 电信 1.0.64.0 1.0.127.255 日本 広島県中区大手町Energia通信公司 1.0.128.0 1.0.255.255 泰国 CZ88.NET 1.1.0.0 1.1.0.255 福建省 电信 1.1.1.0 1.1.1.255 澳大利亚 亚太互联网络信息中心 1.1.2.0 1.1.7.255 福建省 电信 1.1.8.0 1.1.63.255 广东省 电信 1.1.64.0 1.1.127.255 日本 东京都新宿区歌舞伎町i2ts公司 1.1.128.0 1.1.255.255 泰国 穆达汉 1.2.0.0 1.2.1.255 福建省 电信 1.2.2.0 1.2.2.255 北京市海淀区 北龙中网(北京)科技有限责任公司 1.2.3.0 1.2.3.255 澳大利亚 CZ88.NET 1.2.4.0 1.2.4.255 北京市 中国互联网络信息中心这样是不够的,我们需要导入到数据库,就要遵循一定的格式,要把start和end转换成long型的数据,代码如下:
import socket
import struct
import codecs

# Collected CSV rows, filled by format_one_line and written by
# write_to_csv_file.
LINES = []


def ip_str_to_int(ip):
    """Return the dotted-quad IPv4 string ``ip`` as an unsigned 32-bit int."""
    # "!I" reads the four packed bytes in network (big-endian) order.
    return struct.unpack("!I", socket.inet_aton(str(ip)))[0]


def remove_invalid_space(line):
    """Split a raw text line on runs of whitespace and return the fields."""
    return line.split()


def is_china(location):
    """Return True when ``location`` matches any of the China checks below."""
    return (is_municipality_or_province(location)
            or is_autonomous_region(location)
            or is_special_administrative_region(location))


def is_municipality_or_province(location):
    """Return True when ``location`` contains '市', '省' or '中国'."""
    return '市' in location or '省' in location or '中国' in location


def is_autonomous_region(location):
    """Return True when ``location`` names one of the five autonomous regions."""
    # '新疆' and '西藏' replace two '*' placeholders that could never match a
    # real location string.
    return any(region in location
               for region in ('内蒙古', '宁夏', '新疆', '西藏', '广西'))


def is_special_administrative_region(location):
    """Return True when ``location`` contains '香港' or '澳门'."""
    return '香港' in location or '澳门' in location


def convert_location(location):
    """Split ``location`` into a ``'country,province,city'`` string.

    Locations recognised as Chinese get country '中国' with province/city
    sliced out of the text; anything else is returned as the country with
    empty province and city, except that a non-matching location containing
    '大学' is filed under '中国' with the whole text as the province.
    """
    province = ''
    city = ''
    if not is_china(location):
        country = location
        if '大学' in location:
            # Special case: treat it as a Chinese entry.
            country = '中国'
            province = location
    elif '中国' in location:
        country = '中国'
    else:
        country = '中国'
        if '澳门' in location:
            province = city = '澳门'
        elif '香港' in location:
            province = city = '香港'
        elif is_autonomous_region(location):
            # The region name is taken from the start of the string:
            # three characters for 内蒙古, two for the others.
            length = 3 if '内蒙古' in location else 2
            province = location[:length]
            index = location.find('市')
            if index != -1:
                city = location[length:index]
        elif '省' in location:
            index_province = location.find('省')
            province = location[:index_province]
            index_city = location.find('市')
            if index_city != -1:
                city = location[index_province + 1:index_city]
        else:
            # Only reachable when the location contains '市': the text before
            # it is the province, and the part up to and including '区' (if
            # present) is the city.
            index_city = location.find('市')
            province = location[:index_city]
            index_region = location.find('区')
            if index_region != -1:
                city = location[index_city + 1:index_region + 1]
    return country + ',' + province + ',' + city


def format_one_line(line):
    """Convert split fields into one CSV row and append it to LINES.

    The row is ``begin,end,country,province,city,net`` followed by a newline,
    where begin/end are the integer forms of the first two fields.  Everything
    after the third field is concatenated into the net description; when there
    is no fourth field the last field is used instead.
    """
    begin = ip_str_to_int(line[0])
    end = ip_str_to_int(line[1])
    net = ''.join(line[3:]) or line[-1]
    location = convert_location(line[2])
    LINES.append(str(begin) + ',' + str(end) + ',' + location + ',' + net + '\n')


def write_to_csv_file(lines):
    """Append ``lines`` to 'ip.csv', encoded as raw-unicode-escape."""
    try:
        with open('ip.csv', 'a', encoding = 'raw-unicode-escape') as file:
            file.writelines(lines)
    except FileNotFoundError:
        print('file not found')


def format_ip_file(path):
    """Read the merged IP file at ``path`` and convert every record into LINES.

    Blank lines are skipped; a missing file is reported with a message
    instead of an exception.
    """
    try:
        with open(path) as file:
            for raw_line in file:
                fields = remove_invalid_space(raw_line)
                if not fields:
                    # An empty or whitespace-only line has no fields to index.
                    continue
                format_one_line(fields)
    except FileNotFoundError:
        print('file not found')


print('start format')
# Raw string: the backslashes in the Windows path are literal characters.
format_ip_file(r'D:\workspace\Python\ip\ip_tmp.txt')
print('end format')
print('start write')
write_to_csv_file(LINES)
print('end write', end = '')

# A sample of the formatted data follows.
0,16777215,IANA\u4fdd\u7559\u5730\u5740,,,CZ88.NET 16777216,16777471,\u6fb3\u5927\u5229\u4e9a,,,CZ88.NET 16777472,16778239,\u4e2d\u56fd,\u798f\u5efa,,\u7535\u4fe1 16778240,16779263,\u6fb3\u5927\u5229\u4e9a,,,CZ88.NET 16779264,16781311,\u4e2d\u56fd,\u5e7f\u4e1c,,\u7535\u4fe1 16781312,16785407,\u65e5\u672c,,,Beacon\u670d\u52a1\u5668 16785408,16793599,\u4e2d\u56fd,\u5e7f\u4e1c,,\u7535\u4fe1 16793600,16809983,\u65e5\u672c,,,\u5e83\u5cf6\u770c\u4e2d\u533a\u5927\u624b\u753aEnergia\u901a\u4fe1\u516c\u53f8 16809984,16842751,\u6cf0\u56fd,,,CZ88.NET 16842752,16843007,\u4e2d\u56fd,\u798f\u5efa,,\u7535\u4fe1 16843008,16843263,\u6fb3\u5927\u5229\u4e9a,,,\u4e9a\u592a\u4e92\u8054\u7f51\u7edc\u4fe1\u606f\u4e2d\u5fc3 16843264,16844799,\u4e2d\u56fd,\u798f\u5efa,,\u7535\u4fe1 16844800,16859135,\u4e2d\u56fd,\u5e7f\u4e1c,,\u7535\u4fe1中文转换成unicode码可以在导入数据库的时候省下很多容量
最后的结果保存在一个csv文件中,可以方便地导入到数据库,导入到数据库的脚本:
windows下:
load data infile 'D:\\ip.csv' into table `ip` fields terminated by ',' optionally enclosed by '' escaped by '' lines terminated by '\r\n';
linux下:
load data infile '/usr/share/ae/ip.csv' into table `ip` fields terminated by ',' optionally enclosed by '' escaped by '' lines terminated by '\r\n';
导入了之后就可以根据ip查询位置信息了:
大概形式是:
SELECT * FROM ip_mem WHERE end > 16844803 LIMIT 1
因为begin和end是按照顺序排列的,所以可以简单地这么写;最好把数据库引擎设置为memtable(内存表),这样查询速度快
还有个问题是,一般我们都是根据形如172.1.25.2这样的ip字符串来获取位置信息,所以需要先做一层转换,把ip转成long型,java代码如下:
/**
 * Converts a dotted-quad IPv4 string into its unsigned 32-bit value, held in
 * a long.
 *
 * @param ip the address text, e.g. "1.0.0.0"
 * @return the numeric address, or 0 when {@code ip} does not split into
 *         exactly four fields or any field fails to parse (the exception is
 *         printed, not rethrown)
 */
private long transform(String ip) {
    long result = 0;
    try {
        String[] split = ip.split("\\.");
        if (split.length != 4)
            return 0;
        for (String part : split) {
            // Most significant octet first. The long mask keeps each field
            // to 8 bits; an int literal such as 0xFF000000 would sign-extend
            // and let out-of-range values leak into the upper 32 bits.
            result = (result << 8) | (Long.parseLong(part) & 0xFFL);
        }
    } catch (Exception e) {
        e.printStackTrace();
        // Discard any partially accumulated value so bad input yields 0.
        return 0;
    }
    return result;
}