
前提:
了解正则基本语法
import re
with open('top10_xiaozhuang_net.log','r') as f1: #读取日志文件 subject=f1.readlines()
with open('slice_log.log','w') as f2: #将切割结果存储到slice_log.log
for line in subject:
#line:
2019-04-15 00:00:00 192.168.254.253 info LinkProof: 14/04/2019 22:51:53 14/04/2019 22:52:48 114. 80.179.132 210. 29.144. 1 211.65.207.189 UDP 17224 53 0.0.0.0 OTHER 84,
14/04/2019 22:51:53 14/04/2019 22:52:48 120.221.144.117 210. 29.144. 1 211.65.207.189 UDP 38883 53 0.0.0.0 OTHER 80,
14/04/2019 22:51:53 14/04/2019 22:52:48 112. 47. 12.154 210. 29.144. 1 211.65.207.189 UDP 34323 53 0.0.0.0 OTHER 76,
#将log切块,使得结果成为结构统一的块
result = re.split(
#用问号和"...LinkProof"和","来切
r""".*LinkProof:\s+|\,
""",
line.strip('\n'), 0, re.VERBOSE) #result : ['', '14/04/2019 22:51:53 14/04/2019 22:52:48 120.221.145. 4 210. 29.144. 1 211.65.207.189 UDP 64777 53 0.0.0.0 OTHER 305','...',''] lenth = 9
#用切片去除头尾的空
for block in result[1:8]:
f2.write(block+'\n')
#将日期和时间分开取,正则表达式更简单,效率会更高
date1 = r"\S*" #反取,取不为空格的
time1 = r"\S*"
date2 = r"\S*"
time2 = r"\S*"
# time1 = r"\d{2}/\d{2}/\d{4}\s+(?:\d+\:){2}\d{2}" #取IP,因为存在IP里存在空格,所以用相对复杂的正则保证每次取到
ip1 = r"(?:\d{1,3}\.\s*){3}\d{1,3}"
ip2 = r"(?:\d{1,3}\.\s*){3}\d{1,3}"
ip3 = r"(?:\d{1,3}\.\s*){3}\d{1,3}"
protocal = r"\w{3}"
sizelike = r"\d*"
portlike = r"\d*"
ip4 = r"\S*"
type = r"\w*"
num = r"\d*"
#正则预编译
log_pattern = re.compile(r"(%s)\s+(%s)\s+(%s)\s+(%s)\s+(%s)\s+(%s)\s+(%s)\s+(%s)\s+(%s)\s+(%s)\s+(%s)\s+(%s)\s+(%s)" \
%(date1,time1,date2,time2,ip1,ip2,ip3,protocal,sizelike,portlike,ip4,type,num),re.VERBOSE)
l = []
with open('slice_log.log','r') as f2: #
lines = f2.readlines() for line in lines:
dic = {}
line_matchs = log_pattern.match(line)
if line_matchs != None:
all_groups = line_matchs.groups()
dic["date1"] = all_groups[0]+" "+all_groups[1]
dic["date2"] = all_groups[2]+" "+all_groups[3] #去掉IP里的空格
dic["ip1"] = all_groups[4].replace(" ","")
dic["ip2"] = all_groups[5].replace(" ","")
dic["ip3"] = all_groups[6].replace(" ","") dic["protocal"] = all_groups[7]
dic["sizelike"] = all_groups[8]
dic["portlike"] = all_groups[9]
dic["ip4"] = all_groups[10].replace(" ", "")
dic["type"] = all_groups[11]
dic["num"] = all_groups[12] l.append(dic)
# print((all_groups)) for item in l:
print(item)
得到的结果:

可再参考文章: