buffer = 8192 这个取值来自网络,但我不清楚它是如何通过实践得出的。
import os
import time
from hashlib import md5

# Files at or above this size (1000 MiB) are hashed chunk by chunk instead of
# being read into memory in a single call.
_BIG_FILE_THRESHOLD = 1000 * 1024 * 1024

# Default number of bytes read per iteration when hashing in chunks.
_DEFAULT_CHUNK_SIZE = 8192


def calMD5(str):
    """Return the hexadecimal MD5 digest of ``str``.

    Args:
        str: The data to hash. Byte strings (and other buffer objects) are
            hashed as-is; text strings are encoded as UTF-8 first.

    Returns:
        The 32-character lowercase hexadecimal digest.
    """
    # The parameter name shadows the builtin but is kept for callers that
    # pass it by keyword.
    if isinstance(str, type(u"")):
        str = str.encode("utf-8")
    return md5(str).hexdigest()


def calMD5ForFile(file):
    """Return the hexadecimal MD5 digest of the file at path ``file``.

    Files smaller than 1000 MiB are read into memory in one call; larger
    files print a notice and are delegated to ``calMD5ForBigFile``.

    Args:
        file: Path of the file to hash.

    Returns:
        The 32-character lowercase hexadecimal digest.

    Raises:
        OSError: If the file cannot be inspected or opened (IOError on
            Python 2).
    """
    if os.path.getsize(file) >= _BIG_FILE_THRESHOLD:
        print("File size > 1000, move to big file...")
        return calMD5ForBigFile(file)
    # NOTE: this holds the entire file in memory (up to ~1000 MiB).
    with open(file, 'rb') as f:
        return md5(f.read()).hexdigest()


def calMD5ForFolder(dir, MD5File):
    """Write an MD5 manifest for every file below ``dir`` to ``MD5File``.

    Each manifest line has the form ``<path relative to dir> <digest>``.
    The manifest file itself is skipped when it lies inside ``dir``:
    it is still being written during the walk, so its digest would be
    meaningless.

    Args:
        dir: Root directory to walk recursively.
        MD5File: Path of the manifest file to create or overwrite.
    """
    manifest_path = os.path.normcase(os.path.abspath(MD5File))
    with open(MD5File, 'w') as outfile:
        for root, subdirs, files in os.walk(dir):
            for name in files:
                filefullpath = os.path.join(root, name)
                if os.path.normcase(os.path.abspath(filefullpath)) == manifest_path:
                    continue
                filerelpath = os.path.relpath(filefullpath, dir)
                digest = calMD5ForFile(filefullpath)
                outfile.write(filerelpath + ' ' + digest + "\n")


def calMD5ForBigFile(file, chunk_size=_DEFAULT_CHUNK_SIZE):
    """Return the hexadecimal MD5 digest of ``file``, read in chunks.

    Memory use is bounded by ``chunk_size`` regardless of the file size.

    Args:
        file: Path of the file to hash.
        chunk_size: Number of bytes to read per iteration; must be positive.

    Returns:
        The 32-character lowercase hexadecimal digest.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
        OSError: If the file cannot be opened (IOError on Python 2).
    """
    if chunk_size <= 0:
        # read(0) returns an empty chunk immediately, which would silently
        # produce the digest of empty input.
        raise ValueError("chunk_size must be a positive number of bytes")
    m = md5()
    with open(file, 'rb') as f:
        # iter() with a sentinel stops at the first empty read (end of file).
        for chunk in iter(lambda: f.read(chunk_size), b""):
            m.update(chunk)
    return m.hexdigest()


if __name__ == "__main__":
    # Time the whole-file and chunked implementations on the same inputs.
    sample_files = (
        "E:\\OS\\ubuntu-11.04-desktop-i386.iso",
        "E:\\OS\\ubuntu-12.04-desktop-amd64.iso",
        "D:\\Virtual Machines\\Ubuntu 64-bit\\Ubuntu 64-bit-s001.vmdk",
    )
    for sample in sample_files:
        t = time.time()
        print(calMD5ForFile(sample))
        print(time.time() - t)
        t = time.time()
        print(calMD5ForBigFile(sample))
        # Trailing space and extra newline separate the per-file groups.
        print("%s \n" % (time.time() - t))

# output
# 8b1085bed498b82ef1485ef19074c281
# 2.57500004768
# 8b1085bed498b82ef1485ef19074c281
# 3.34100008011
#
# 128f0c16f4734c420b0185a492d92e52
# 2.632999897
# 128f0c16f4734c420b0185a492d92e52
# 3.39100003242
#
# File size > 1000, move to big file...
# ec1fa4dc1b32569e9da7b4744548a9ef
# 5.40100002289
# ec1fa4dc1b32569e9da7b4744548a9ef
# 5.42100000381
PS:纪念一下我直接计算 3 GB 以上文件时的内存使用率。