对英文文本的字母进行huffman编码,heapq优先队列构建huffman树
python huffman.py source.txt result.txt
1 import sys
2 import heapq
3 import collections
4
5 class Node(object):
6 def __init__(self,value = None,count = 1,left = None,right = None, code = ''):
7 self.value = value
8 self.count = count
9 self.left = left
10 self.right = right
11 self.code = code
12
13 def isleaf(self):
14 if self.left != None:
15 return False
16 elif self.right != None:
17 return False
18 else:
19 return True
20
21 def __repr__(self):
22 return "Node(%r,%r)"%(self.value,self.count)
23 #for sort or priority queue
24 def __lt__(self,other):
25 return self.count < other.count
26 #for operator +
27 def __add__(self,other):
28 self.code = 0
29 other.code = 1
30 # only leaf node value is need
31 return Node(self.value+other.value,self.count+other.count,self,other)
32
33 def getTreeRoot(text):
34 Counter = collections.Counter(text)
35 head = [Node(k,v) for (k,v) in Counter.items()]
36 heapq.heapify(head)
37
38 while len(head) >= 2:
39 heapq.heappush(head, heapq.heappop(head) + heapq.heappop(head))
40
41 root = head[0]
42 return root
43
44 def huffman(root, prefix = []):
45 code = {}
46 if root is None:
47 return code
48 prefix = prefix + [root.code]
49 if root.isleaf():
50 code[root.value] = prefix
51 else:
52 code.update(huffman(root.left,prefix))
53 code.update(huffman(root.right,prefix))
54 return code
55
56 def gethuffmantext(text):
57 root = getTreeRoot(text)
58 codebook = huffman(root)
59 for k,v in codebook.items():
60 newv = "".join(str(char) for char in v)
61 codebook[k] = newv
62 print (codebook)
63 print("The original text size is {}".format(len(text) * 8))
64 huffmantext = []
65 lenhuffman = 0
66 for char in text:
67 lenhuffman += len(codebook[char])
68 huffmantext.append(codebook[char])
69 print ("The huffman code text size is {}".format(lenhuffman))
70
71 return huffmantext, codebook, lenhuffman
72
73 def gettextfromhuffmancode(huffmantext,codebook):
74
75 reversecodebook = {value:key for (key,value) in codebook.items()}
76
77 text = []
78 for huffmancode in huffmantext:
79 text.append(reversecodebook[huffmancode])
80
81 return text
82
83
84 if __name__ == "__main__":
85 text = open(sys.argv[1],"rb").read()
86 newfile = sys.argv[2]
87 #huffmantext, codebook, lenhuffman = gethuffmantext("hello world")
88 huffmantext, codebook, lenhuffman = gethuffmantext(text)
89 text1 = gettextfromhuffmancode(huffmantext,codebook)
90 text1 = "".join(str(v) for v in text1)
91
92 lenoftext = len(text)*8.0
93 lenofhuffman = lenhuffman
94
95 print ("The compression ratio is %lf \n" % (1.0 * lenhuffman / lenoftext ))
96
97 fp = open(newfile,"wb")
98 fp.write(text1)
99 fp.close()