Huffman编码:根据词频构建Huffman树,实现对文本的前缀编码。
1、统计文本中每个字符出现的次数,放入优先队列中,构建一棵空的二叉树;
2、从优先队列中取出(并移除)频率最小的两个字符a、b,以它们的频率fa、fb分别作为二叉树的左、右结点,左结点的编号为0,右结点的编号为1(与下文源码一致);再以其频率之和(fa + fb)作为这两个结点的父亲节点,放入优先队列;
3、重复第二步操作,直至优先队列中只剩下一个数,即为此Huffman树的根节点。
4、从根节点到每个叶节点(文本中出现的字符)的“路径”,即0、1序列串就是该字符的前缀编码。
注:这种编码方式保证了,任意一个字符的编码都不会是其他字符编码的前缀,这样在解码过程中就不会混淆。
数据结构:
为方便记录每个字符的前缀编码,在构建Huffman树过程中,需要保存每一个结点的父亲节点、左右儿子结点、叶节点对应字符、当前结点频率。
压缩过程:
1、首先构建Huffman树,获得每个字符对应的前缀编码;
2、将字符及其对应的前缀编码等压缩信息写入压缩文档中,便于解码;
3、扫描文本,将文本中的字符转换成0、1串,每八位,即一个字节对应的字符存储到压缩文件中。
注:如果最后存储的0、1串不足八位,则在末尾补0,然后将补的位数信息写入压缩文件中。
解压过程:
1、读取压缩信息;
2、扫描压缩文本,将每个字符转化成0、1串,匹配字符的前缀编码,转化成原始文件。
注:解码时需删除之前补充的位数
一点体会:
1、不要在循环内反复动态申请大数组,否则容易导致程序崩溃;
2、千万不要在循环内每次都调用strlen函数:strlen每次调用都要从头扫描到字符串末尾的'\0',耗时与字符串长度成正比;我当初没有深入了解这一点,导致程序慢得要死;
3、原文本越大,压缩率越高,对于一个2M的文件,压缩率大约在45%左右;
4、感谢领导倾情指点,比赛加油!
压缩过程程序源码:
#include <iostream>
#include <fstream>
#include <cstring>
#include <queue>
#include <algorithm>
#include <time.h>
using namespace std;
// 64-bit integer used for file sizes, byte counts and character frequencies.
typedef long long LL;
// Capacity of the char arrays that hold the input/output file paths in main().
const int FILE_LENGTH = 1000;
// Maximum number of bytes read from the input file per batch; the encoded
// bit string is also flushed to disk once it grows beyond this many characters.
const long long MAX_MEMORY = 3 * 1024 * 1024;
// Size of the per-character tables (node[] and HuffmanCode[]), which are
// indexed by character value.
const int KIND_OF_CHARACTER = 260;
// Capacity, in characters, of the buffer holding one Huffman code
// (a string of '0'/'1' plus the terminating '\0').
const int HUFFMAN_CODE_LENGTH = 1000;
// Byte position in the compressed file at which the original file size is
// written; the bytes before it are reserved for the padding header.
const int OFFSET = 20;
// Number of '0'/'1' characters packed into each byte of the compressed file.
const int nBits = 8;
struct Node {
char c; // character
int parent, lChild, rChild;//children node
int iNode; //the serial number of node
LL number; //number of corresponding character
friend bool operator < (Node a, Node b) {
return a.number > b.number;
}
}node[KIND_OF_CHARACTER];
// HuffmanCode[x] is the NUL-terminated string of '0'/'1' characters that encodes
// the character with value x; it is filled in by BuildHuffmanTree().
char HuffmanCode[KIND_OF_CHARACTER][HUFFMAN_CODE_LENGTH];
//LL characters[KIND_OF_CHARACTER];
// Debug helper: prints every character that occurs together with its count.
void CountKinds(); //for test
// Builds the Huffman tree from node[] and fills HuffmanCode[];
// returns the number of distinct characters.
int BuildHuffmanTree();
// Writes the code table and the encoded content of filePath to outPutFilePath.
void CompressFile(const char *filePath, const char *outPutFilePath, int numberOfNode);
// Packs the first len '0'/'1' characters of HTstr into bytes and writes them to outPut.
void BitToInt(ofstream &outPut, char *HTstr, LL len);
int main() {
//scan the file to count frequency of each character.
char filePath[FILE_LENGTH] = "graph.txt"; //"Aesop_Fables.txt"; "graph.txt"; "1.txt";
char compressFilePath[FILE_LENGTH] = "result.txt";
ifstream readIn;
readIn.open(filePath, ios::binary);
if (readIn.is_open() == 0) {
cout << "OPEN FAILED!" << endl;
exit(0);
}
//get size of file
readIn.seekg(0, ios::end);
LL fileSize = (LL)readIn.tellg();
readIn.seekg(0, ios::beg);
cout<<"fileSize" <<fileSize<<endl;
//read data in batches, each time read MAX_MEMORY characters
int nTimes = (int)(fileSize / MAX_MEMORY);
if (fileSize % MAX_MEMORY != 0) nTimes++;
int kindsOfCharacter = 0;
cout<<nTimes<<endl;
for (int i = 1; i <= nTimes; i++) {
char *str = (char *)calloc(1, (MAX_MEMORY+10)*sizeof(char));
LL numberOfCharacter = MAX_MEMORY;
if (i == nTimes) {
numberOfCharacter = fileSize % MAX_MEMORY;
}
readIn.read(str, numberOfCharacter * sizeof(char));
str[numberOfCharacter] = '\0';
cout<<strlen(str)<<endl;
//count the frequency of each character.
int lenStr = strlen(str);
for (LL j = 0; j < lenStr; j++) {
node[str[j]].number++;
node[str[j]].c = str[j];
}
free(str);
}
// CountKinds();
//build Huffman tree
int numberOfNode = BuildHuffmanTree();
//compress file using Huffman code
CompressFile(filePath, compressFilePath, numberOfNode);
//outPut.close();
// readIn.close();
}
// Builds the Huffman tree from the frequencies stored in the global node[]
// table and stores, for every character that occurs, its code as a
// NUL-terminated string of '0'/'1' characters in HuffmanCode[character value].
// A left edge contributes '0' and a right edge contributes '1'.
// Prints the number of distinct characters and every character with its code.
// Returns the number of distinct characters (0 for an empty input).
int BuildHuffmanTree(){
    //a tree with n leaves has at most 2n-1 nodes, so 2 * KIND_OF_CHARACTER is enough
    Node* HT = (Node *)malloc((2 * KIND_OF_CHARACTER) * sizeof(Node));
    if (!HT) {
        cout << "OUT OF MEMORY!" << endl;
        exit(1);
    }

    //put all kinds of character into priority queue; leaves occupy HT[0..numberOfNode)
    priority_queue<Node> q;
    int numberOfNode = 0;
    for (int i = 0; i < KIND_OF_CHARACTER; i++) {
        if (node[i].number != 0) {
            node[i].iNode = numberOfNode;
            node[i].c = (char)i;
            q.push(node[i]);
            HT[numberOfNode] = node[i];
            numberOfNode++;
        }
    }
    cout << numberOfNode << endl;

    //empty input: there is no root, and HT[jNode-1] below would be HT[-1]
    if (numberOfNode == 0) {
        cout<<numberOfNode<<endl;
        free(HT);
        return 0;
    }

    //repeatedly merge the two lightest nodes under a new parent
    int jNode = numberOfNode;
    while (q.size() > 1){
        Node leftNode = q.top();
        q.pop();
        Node rightNode = q.top();
        q.pop();
        const int l = leftNode.iNode;
        const int r = rightNode.iNode;
        HT[l].parent = jNode;
        HT[r].parent = jNode;
        //set parent's information
        HT[jNode].c = ' ';
        HT[jNode].iNode = jNode;
        HT[jNode].lChild = l;
        HT[jNode].rChild = r;
        HT[jNode].number = leftNode.number + rightNode.number;
        q.push(HT[jNode]);
        jNode++;
    }
    //the node created last is the root (for a single character, the only leaf)
    HT[jNode-1].parent = -1;

    //get each character's Huffman code by walking from the leaf up to the root
    for (int i = 0; i < numberOfNode; i++) {
        //index the code table with the unsigned byte value: a plain char may be negative
        const unsigned char ch = (unsigned char)HT[i].c;
        char *code = HuffmanCode[ch];
        int k = 0;
        int child = i;
        for (int j = HT[i].parent; j != -1; j = HT[j].parent) {
            code[k] = (HT[j].lChild == child) ? '0' : '1';
            child = j;
            k++;
        }
        if (k == 0) {
            //only one distinct character: the leaf is the root and would get an
            //empty code, so no bits would be written. Give it the 1-bit code "0".
            code[k] = '0';
            k++;
        }
        //the bits were collected leaf-to-root; the code reads root-to-leaf
        std::reverse(code, code + k);
        code[k] = '\0';
        cout << HT[i].c << " " << code << endl;
    }
    cout<<numberOfNode<<endl;
    free(HT);
    return numberOfNode;
}
void CompressFile(const char *filePath, const char *outPutFilePath, int numberOfNode){
//scan characters in input file once more
ifstream readIn;
readIn.open(filePath, ios::binary);
if (readIn.is_open() == 0) {
cout << "OPEN FAILED!" << endl;
exit(0);
}
//write Huffman code file
//Information: number of bits added, OFFSET, size of original file. the number of kinds of character
ofstream outPut;
outPut.open(outPutFilePath, ios::binary);
if (outPut.is_open() == 0) {
cout << "OPEN FAILED!" << endl;
exit(0);
}
//get size of file
readIn.seekg(0, ios::end);
LL fileSize = (LL)readIn.tellg();
readIn.seekg(0, ios::beg);
//write some information in compressed file
outPut.seekp(OFFSET, ios::beg);
outPut.write((char *)&fileSize, sizeof(LL));
outPut.write((char *)&numberOfNode, sizeof(int));
//record the character and its Huffman code
for (int i = 0; i < KIND_OF_CHARACTER; i++) {
if (node[i].number != 0) {
outPut.write((char *)&i, sizeof(char));
int bits = strlen(HuffmanCode[i]);
outPut.write((char *)&bits, sizeof(int));
outPut.write((char *)&HuffmanCode[i], bits*sizeof(char));
}
}
//read data in batches, each time read MAX_MEMORY characters and encode
int nTimes = (int)(fileSize / MAX_MEMORY);
if (fileSize % MAX_MEMORY != 0) nTimes++;
int kindsOfCharacter = 0;
char *HTstr = (char *)calloc(1, (MAX_MEMORY+HUFFMAN_CODE_LENGTH)*sizeof(char));
int len = 0;
LL lenT = 0;
for (int i = 1; i <= nTimes; i++) {
char *str = (char *)calloc(1, (MAX_MEMORY+10)*sizeof(char));
LL numberOfCharacter = MAX_MEMORY;
if (i == nTimes) {
numberOfCharacter = fileSize % MAX_MEMORY;
}
readIn.read(str, numberOfCharacter * sizeof(char));
str[numberOfCharacter] = '\0';
for (LL j = 0; j < numberOfCharacter; j++) {
char ch = str[j];
lenT += strlen(HuffmanCode[ch]);
strcpy(HTstr+len, HuffmanCode[ch]);
len += strlen(HuffmanCode[ch]);
//write compressed file in batches
//when the length of encode string is greater than limited memory
if (len > MAX_MEMORY) {
// cout<<"****"<<endl;
LL leftBits = len % nBits;
LL changeLength = len - leftBits;
BitToInt(outPut, HTstr, changeLength);
//if no left bits, no need to keep it.
strcpy(HTstr, HTstr+changeLength);
len = strlen(HTstr);
}
}
free(str);
}
//cout<<strlen(HTstr)<<" "<<HTstr<<endl;
//if there are left bits, change int integer
if (len != 0) {
BitToInt(outPut, HTstr, len);
//store tail???
}
free(HTstr);
readIn.close();
outPut.close();
}
// Packs a string of '0'/'1' characters into bytes (most significant bit first)
// and writes the bytes to outPut.
//
// outPut  compressed file, positioned where the bytes are to be written
// HTstr   bit string; must have room for up to nBits-1 padding characters plus
//         a terminator after position len, because padding is appended in place
// len     number of bit characters to convert; nothing is written when len <= 0
//
// When len is not a multiple of nBits, '0' characters are appended to fill the
// last byte, and the number of appended bits followed by OFFSET is written at
// the very beginning of the file; the write position is restored afterwards.
// Conversion stops early if a '\0' is met before len characters were consumed.
void BitToInt(ofstream &outPut, char* HTstr, LL len) {
    if (len <= 0) return;  //no bits: do not emit a spurious zero byte

    //add 0 to make the length of HTstr divisible by nBits (8)
    int bitsToAdd = 0;
    if (len % nBits != 0) {
        bitsToAdd = nBits - (int)(len % nBits);
        const streampos pos = outPut.tellp();
        outPut.seekp(0, ios::beg);
        outPut.write((const char *)&bitsToAdd, sizeof(int));
        outPut.write((const char *)&OFFSET, sizeof(int));
        outPut.seekp(pos, ios::beg);
        for (int k = 0; k < bitsToAdd; k++){
            HTstr[len+k] = '0';
        }
        HTstr[len+bitsToAdd] = '\0';
    }

    //convert bit characters to bytes; the first character of each group
    //becomes the most significant bit
    const LL totalBits = len + bitsToAdd;
    unsigned char current = 0;
    int filled = 0;
    for (LL i = 0; i < totalBits && HTstr[i]; i++) {
        if (HTstr[i] == '1') {
            current = (unsigned char)(current | (1 << (nBits - 1 - filled)));
        }
        filled++;
        if (filled == nBits) {
            //write a real byte rather than the first byte of an int, whose
            //value would depend on the machine's endianness
            outPut.put((char)current);
            current = 0;
            filled = 0;
        }
    }
    //a group cut short by an early '\0' is still written, zero-filled on the right
    if (filled != 0) {
        outPut.put((char)current);
    }
}
void CountKinds(){
int kinds = 0;
for (int i = 0; i < KIND_OF_CHARACTER; i++) {
if (node[i].number != 0) {
printf("%c ", node[i].c);
cout << node[i].c << " " << node[i].number<<endl;
kinds++;
}
}
cout << kinds << endl; //76
}
#include <iostream>
#include <fstream>
#include <algorithm>
#include <cstring>
using namespace std;
// 64-bit integer used for file sizes and byte counts.
typedef long long LL;
// Capacity of the char arrays that hold the input/output file paths in main().
const int FILE_LENGTH = 1000;
// Capacity, in characters, of the buffer holding one Huffman code
// (a string of '0'/'1' plus the terminating '\0').
const int HUFFMAN_CODE_LENGTH = 1000;
// Number of entries in the code table node[].
const int KIND_OF_CHARACTER = 256;
// Maximum number of compressed bytes read from the file per batch; decoded
// output is also written out once it reaches about this many characters.
const long long MAX_MEMORY = 1 * 1024 * 1024;
// One entry of the code table read from the compressed file:
// a character together with its Huffman code.
struct Node {
    // the decoded character
    char c;
    // its code as a NUL-terminated string of '0'/'1' characters
    char Huffmancode[HUFFMAN_CODE_LENGTH];
};

// Code table; the entries are filled in by GetCompressInformation().
Node node[KIND_OF_CHARACTER];
// Number of code bits unpacked from each byte of the compressed data.
int nBits = 8;
// Size of the original file, read from the compressed file's metadata.
LL originalFileSize; //the size of original file
// Number of code-table entries (distinct characters) stored in the compressed file.
int numberOfNode; //number of kind of character
// Number of padding bits in the final compressed byte; read from the file header.
int bitsAdded;
// Byte position at which the metadata starts; read from the file header.
int OFFSET;
// Reads the header and the code table; returns the length of the longest code.
int GetCompressInformation(ifstream &readIn);
// Decodes the data that follows the code table in readIn and writes it to writeOut.
void DecompressFile(ifstream &readIn, ofstream &writeOut, int maxEncodingLength);
// Entry point of the decompressor: opens the compressed file "result.txt" and
// the output file "decompressResult.txt", reads the header and code table, and
// decodes the content.
// Returns 0 on success and 1 when either file cannot be opened, so that a
// failed run is not reported as a success to the calling shell or script.
int main() {
    char compressFilePath[FILE_LENGTH] = "result.txt"; //graph.txt "1.txt";
    char decompressFilePath[FILE_LENGTH] = "decompressResult.txt";

    ifstream readIn;
    readIn.open(compressFilePath, ios::binary);
    if (!readIn.is_open()) {
        cout << "OPEN FAILED!" << endl;
        return 1;
    }

    ofstream writeOut;
    writeOut.open(decompressFilePath, ios::binary);
    if (!writeOut.is_open()) {
        cout << "OPEN FAILED!" << endl;
        return 1;
    }

    //get information of compressed file
    const int maxEncodingLength = GetCompressInformation(readIn);
    //decompress File
    DecompressFile(readIn, writeOut, maxEncodingLength);

    readIn.close();
    writeOut.close();
    return 0;
}
int GetCompressInformation(ifstream &readIn){
readIn.read((char *)&bitsAdded, sizeof(int));
readIn.read((char *)&OFFSET, sizeof(int));
readIn.seekg(OFFSET, ios::beg);
readIn.read((char *)&originalFileSize, sizeof(LL));
readIn.read((char *)&numberOfNode, sizeof(int));
cout << originalFileSize << " " << numberOfNode << endl;
//record the character and its Huffman code
int maxEncodingLength = 0;
for (int i = 0; i < numberOfNode; i++) {
readIn.read((char *)&node[i].c, sizeof(char));
int bits;
readIn.read((char *)&bits, sizeof(int));
readIn.read((char *)&node[i].Huffmancode, bits*sizeof(char));
node[i].Huffmancode[bits] = '\0';
cout << node[i].c << " " << node[i].Huffmancode << endl;
if (maxEncodingLength < strlen(node[i].Huffmancode)) {
maxEncodingLength = strlen(node[i].Huffmancode);
}
}
cout << " maxEncodingLength :" << maxEncodingLength << endl;
return maxEncodingLength;
}
// Decodes the encoded content of the compressed file and writes the result.
//
// readIn             compressed file, positioned at the first encoded byte
//                    (right behind the code table)
// writeOut           destination of the decoded characters
// maxEncodingLength  length of the longest code; kept for interface
//                    compatibility, the code lengths are taken from node[]
//
// Every input byte is expanded into nBits '0'/'1' characters (the final byte
// of the file loses its last bitsAdded padding bits). The characters are
// appended to a buffer of pending bits, and code-table entries are stripped
// from the front of that buffer for as long as one matches. Bits that do not
// yet form a complete code are carried over to the next byte - also across
// batch boundaries, so a code may span two batches.
// If the pending bits can no longer match any code, the data is reported as
// corrupted and decoding stops after writing what was decoded so far.
void DecompressFile(ifstream &readIn, ofstream &writeOut, int maxEncodingLength){
    (void)maxEncodingLength;

    //size of the encoded content = bytes between the current position and the end
    const streampos payloadStart = readIn.tellg();
    readIn.seekg(0, ios::end);
    const LL compressedFileSize = (LL)(readIn.tellg() - payloadStart);
    readIn.seekg(payloadStart, ios::beg);
    cout << "size of compressed file : " << compressedFileSize << endl;

    //work on a local copy so that the global nBits is not modified
    const int bitsPerByte = nBits;
    const int pendingCapacity = 2 * HUFFMAN_CODE_LENGTH;

    //cache the code lengths instead of calling strlen() for every comparison
    const int usedNodes = min(numberOfNode, KIND_OF_CHARACTER);
    int codeLength[KIND_OF_CHARACTER];
    for (int z = 0; z < usedNodes; z++) {
        codeLength[z] = (int)strlen(node[z].Huffmancode);
    }

    //str: decoded output, buf: one batch of input, pending: undecoded bits
    char *str = (char *)calloc(1, (MAX_MEMORY + HUFFMAN_CODE_LENGTH)* sizeof(char));
    char *buf = (char *)calloc(1, (MAX_MEMORY + HUFFMAN_CODE_LENGTH)* sizeof(char));
    char *pending = (char *)calloc(1, pendingCapacity * sizeof(char));
    if (!str || !buf || !pending) {
        cout << "OUT OF MEMORY!" << endl;
        free(str);
        free(buf);
        free(pending);
        return;
    }

    LL lenOfChar = 0;     //decoded characters waiting in str
    int pendingLen = 0;   //bit characters waiting in pending
    LL consumed = 0;      //input bytes processed in earlier batches
    bool corrupted = false;

    //read data in batches, each time read at most MAX_MEMORY characters
    while (consumed < compressedFileSize && !corrupted) {
        const LL wanted = min(MAX_MEMORY, compressedFileSize - consumed);
        readIn.read(buf, wanted * sizeof(char));
        const LL bytesRead = (LL)readIn.gcount();
        if (bytesRead <= 0) break;  //unexpected end of input

        for (LL k = 0; k < bytesRead; k++) {
            const unsigned char ascII = (unsigned char)buf[k];

            //the very last byte of the file carries bitsAdded padding bits
            int usableBits = bitsPerByte;
            if (consumed + k == compressedFileSize - 1) {
                usableBits = bitsPerByte - bitsAdded;
                if (usableBits < 0) usableBits = 0;
                if (usableBits > bitsPerByte) usableBits = bitsPerByte;
            }
            //with a valid code table fewer bits than the longest code stay
            //pending, so running out of room means nothing can match any more
            if (pendingLen + usableBits > pendingCapacity) {
                corrupted = true;
                break;
            }
            // convert it to binary bits, most significant bit first
            for (int b = 0; b < usableBits; b++) {
                pending[pendingLen++] = ((ascII >> (bitsPerByte - 1 - b)) & 1) ? '1' : '0';
            }

            //convert bit to char: strip matching codes from the front
            int start = 0;
            bool matched = true;
            while (matched && start < pendingLen) {
                matched = false;
                for (int z = 0; z < usedNodes; z++) {
                    const int n = codeLength[z];
                    //an empty code would match forever; a longer code cannot match yet
                    if (n == 0 || n > pendingLen - start) continue;
                    if (memcmp(node[z].Huffmancode, pending + start, n) == 0) {
                        //if str is full, write it into the decompressed file first
                        if (lenOfChar >= MAX_MEMORY) {
                            writeOut.write(str, lenOfChar * sizeof(char));
                            lenOfChar = 0;
                        }
                        str[lenOfChar++] = node[z].c;
                        start += n;
                        matched = true;
                        break;
                    }
                }
            }
            //keep the undecoded tail; memmove because the ranges overlap
            if (start > 0) {
                memmove(pending, pending + start, pendingLen - start);
                pendingLen -= start;
            }
        }
        consumed += bytesRead;
    }

    if (lenOfChar != 0){
        writeOut.write(str, lenOfChar * sizeof(char));
    }
    if (corrupted) {
        cout << "CORRUPTED DATA!" << endl;
    }
    //released on every path, also when nothing was left to write
    free(pending);
    free(buf);
    free(str);
}