本文实例讲述了Python实现的大数据分析操作系统日志功能。分享给大家供大家参考,具体如下:
一 代码
1、大文件切分
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
|
import os
import os.path
import time
def FileSplit(sourceFile, targetFolder, linesPerFile=1000):
    """Split a large text file into numbered chunk files.

    Consecutive runs of at most ``linesPerFile`` lines from ``sourceFile`` are
    written to ``<stem>1.txt``, ``<stem>2.txt``, ... inside ``targetFolder``,
    where ``<stem>`` is the base name of ``sourceFile`` without its extension.
    Every line is copied verbatim, including its line ending.

    Args:
        sourceFile: Path of the text file to split.
        targetFolder: Directory that receives the chunk files; it is created
            (including missing parents) when it does not exist.
        linesPerFile: Maximum number of lines per chunk file. Defaults to 1000.

    Returns:
        None. When ``sourceFile`` is not an existing file a message is printed
        and nothing is written.

    Raises:
        ValueError: If ``linesPerFile`` is smaller than 1.
    """
    # Function-scope import keeps the block self-contained.
    from itertools import islice

    if linesPerFile < 1:
        raise ValueError('linesPerFile must be at least 1')
    if not os.path.isfile(sourceFile):
        print(sourceFile, ' does not exist.')
        return
    os.makedirs(targetFolder, exist_ok=True)

    # Use only the base name so a source path containing directories (or an
    # absolute path) cannot make os.path.join() escape targetFolder, and use
    # splitext() so extensions of any length are handled.
    stem = os.path.splitext(os.path.basename(sourceFile))[0]

    fileNum = 1
    with open(sourceFile, 'r') as srcFile:
        while True:
            # islice() pulls the next batch straight from the file iterator;
            # blank lines are kept and do not terminate the split.
            chunk = list(islice(srcFile, linesPerFile))
            if not chunk:
                break
            desFile = os.path.join(targetFolder, stem + str(fileNum) + '.txt')
            # 'w' rather than 'a+': re-running the split must not append a
            # second copy of the data to chunk files from an earlier run.
            with open(desFile, 'w') as f:
                f.writelines(chunk)
            fileNum += 1
# Script entry point: split 'test.txt' into chunk files under the 'test' folder.
if __name__ == '__main__':
    # To prompt for the paths instead of hard-coding them, read both values
    # with input() here.
    sourceFile = 'test.txt'
    targetFolder = 'test'
    FileSplit(sourceFile, targetFolder)
|
2、Mapper代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
|
import os
import re
import threading
import time
def Map(sourceFile):
    """Count log lines per date in one file and write the counts beside it.

    The first ``M/D/YYYY``-style date found on each line is used as the key.
    The result is written to ``<sourceFile without extension>_map.txt`` as one
    ``date:count`` line per distinct date, in order of first appearance.

    Args:
        sourceFile: Path of the text file to scan.

    Returns:
        None. When ``sourceFile`` is not an existing file a message is printed
        and nothing is written.
    """
    # isfile() rather than exists(): a directory path would pass exists() and
    # then fail inside open().
    if not os.path.isfile(sourceFile):
        print(sourceFile, ' does not exist.')
        return

    pattern = re.compile(r'[0-9]{1,2}/[0-9]{1,2}/[0-9]{4}')
    result = {}
    with open(sourceFile, 'r') as srcFile:
        for dataLine in srcFile:
            # Only the first date on a line matters, so search() suffices and
            # avoids building the full match list that findall() returns.
            found = pattern.search(dataLine)
            if found:
                key = found.group()
                result[key] = result.get(key, 0) + 1

    # splitext() handles extensions of any length (the fixed [0:-4] slice
    # assumed exactly '.txt'-sized suffixes).
    desFile = os.path.splitext(sourceFile)[0] + '_map.txt'
    # 'w' rather than 'a+': appending on a re-run would leave duplicate
    # date lines that a later reduce step would add up twice.
    with open(desFile, 'w') as fp:
        fp.writelines(k + ':' + str(v) + '\n' for k, v in result.items())
# Script entry point: run Map() over every chunk file in the 'test' folder.
if __name__ == '__main__':
    desFolder = 'test'
    # Map only the input chunks. Files ending in '_map.txt' are outputs of an
    # earlier mapper run, and 'result.txt' is where the reducer script writes
    # its totals; mapping either again would feed bogus counts to the reducer.
    files = [
        name for name in sorted(os.listdir(desFolder))
        if os.path.isfile(os.path.join(desFolder, name))
        and not name.endswith('_map.txt')
        and name != 'result.txt'
    ]
    # Without threads this would simply be a loop calling
    # Map(os.path.join(desFolder, name)) for each name in files.
    # With threads: one worker per chunk file. os.path.join() replaces the
    # hard-coded '\\' separator so the script also runs outside Windows.
    threads = [
        threading.Thread(target=Map, args=(os.path.join(desFolder, name),))
        for name in files
    ]
    for t in threads:
        t.start()
    # Wait for every mapper so the script only ends once all outputs exist.
    for t in threads:
        t.join()
|
3、Reducer代码
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
|
import os
def Reduce(sourceFolder, targetFile):
    """Sum the per-date counts of all ``*_map.txt`` files in a folder.

    Every file in ``sourceFolder`` whose name ends with ``_map.txt`` is read as
    ``key:count`` lines; counts with the same key are added together and the
    totals are written to ``targetFile`` as ``key:total`` lines.

    Args:
        sourceFolder: Directory containing the ``*_map.txt`` files.
        targetFile: Path of the file that receives the totals (overwritten).

    Returns:
        None. When ``sourceFolder`` is not an existing directory a message is
        printed and nothing is written.

    Raises:
        ValueError: If a non-blank line has no ``:`` separator or its count is
            not an integer.
    """
    if not os.path.isdir(sourceFolder):
        print(sourceFolder, ' does not exist.')
        return

    # Deal only with the mapped files. The suffix must be exactly '_map.txt'
    # (a stray leading space here would match nothing). Sorting makes the
    # output order independent of the arbitrary order os.listdir() returns,
    # and os.path.join() replaces the Windows-only '\\' separator.
    allFiles = [
        os.path.join(sourceFolder, name)
        for name in sorted(os.listdir(sourceFolder))
        if name.endswith('_map.txt')
    ]

    result = {}
    for path in allFiles:
        with open(path, 'r') as fp:
            for line in fp:
                line = line.strip()
                if not line:
                    continue
                # Split on the last ':' so a key that itself contains a
                # colon still parses; the count is always the final field.
                key, sep, count = line.rpartition(':')
                if not sep:
                    raise ValueError('malformed line in ' + path + ': ' + line)
                result[key] = result.get(key, 0) + int(count)

    with open(targetFile, 'w') as fp:
        fp.writelines(k + ':' + str(v) + '\n' for k, v in result.items())
# Script entry point: merge the mapper outputs in 'test' into 'test/result.txt'.
if __name__ == '__main__':
    # os.path.join() instead of the literal 'test\\result.txt' so the result
    # lands inside the folder on every platform, not only on Windows.
    Reduce('test', os.path.join('test', 'result.txt'))
|
二 运行结果
依次运行上面3个程序,得到最终结果:
07/10/2013:4634
07/16/2013:51
08/15/2013:3958
07/11/2013:1
10/09/2013:733
12/11/2013:564
02/12/2014:4102
05/14/2014:737
希望本文所述对大家Python程序设计有所帮助。
原文链接:https://blog.csdn.net/chengqiuming/article/details/78601136