huffman压缩简介
哈夫曼编码(Huffman Coding),又称霍夫曼编码,是一种编码方式,该方法完全依据字符出现概率来构造异字头的平均长度最短的码字,有时称之为最佳编码,一般就叫做Huffman编码(有时也称为霍夫曼编码);
因为这个特性,所以我们可以利用其来进行文件压缩,例如:有一个原始数据序列,ABACCDAA则编码为A(0),B(10),C(110),(D111),压缩后为010011011011100,再将其转为一个整形写入,实现了文件压缩;
构建压缩信息
那么这个过程中,我们在类中也就是整个压缩过程中要对其字符对应的信息例如:其对应的字符出现的次数,其字符对应的huffman编码,好方便我们的压缩和解压的过程;
在类中定义一个CharInfo _infos[256]来表示其所有信息;
typedef long long LongType;
struct CharInfo
{
LongType _count;//字符出现的次数
string _code;//huffman编码
unsigned char _ch;//字符
CharInfo(LongType count = 0)
:_count(count)
, _code("")
{}
};
开始压缩
压缩又可以分为以下几步:
统计字符
统计字符出现的次数
FILE* fout = fopen(filename, "rb");
assert(fout);
//char ch = fgetc(fout);
int ch = fgetc(fout);//因为压缩汉字或者图片有些字符可能超过一个char表示的范围,具体与编码方式有关,在此不做过多描述
while (ch != EOF)
{
_infos[(unsigned char)ch]._count++;
ch = fgetc(fout);
}
建立huffman树
CharInfo invalid;
invalid._count = 0;
HuffmanTree<CharInfo> tree(_infos, 256,invalid);
//另外创建一个HuffmanTree.h文件
#pragma once
template <class T>
struct HuffmanTreeNode//huffman节点
{
T _w;
HuffmanTreeNode* _left;
HuffmanTreeNode* _right;
HuffmanTreeNode* _parent;
HuffmanTreeNode(const T& x)
:_w(x)
, _left(NULL)
, _right(NULL)
, _parent(NULL)
{}
};
template <class T>
class HuffmanTree
{
typedef HuffmanTreeNode<T> Node;
public:
HuffmanTree()
:_root(NULL)
{
}
HuffmanTree(T* a, size_t n,const T& invalid)
{
Heap<Node*> minHeap;//利用堆来进行排序的比较
for (size_t i = 0; i < n; i++)
{
if (a[i] != invalid)
minHeap.Push(new Node(a[i]));
}
while (minHeap.Size() > 1)
{
Node* left = minHeap.Top();
minHeap.Pop();
Node* right = minHeap.Top();
minHeap.Pop();
Node* parent = new Node(left->_w + right->_w);
parent->_left = left;
left->_parent = parent;
parent->_right = right;
right->_parent = parent;
minHeap.Push(parent);
}
_root = minHeap.Top();
}
~HuffmanTree()
{
_Destory(_root);
}
Node* Root()
{
return _root;
}
protected:
void _Destory(Node* root)
{
if (root == NULL)
return;
_Destory(root->_left);
_Destory(root->_right);
delete root;
}
HuffmanTree(const HuffmanTree<T>&);//防拷贝
HuffmanTree& operator=(const HuffmanTree<T>&);
protected:
Node* _root;
};
其中利用的堆排序的代码:
//利用仿函数进行大堆小堆
template <class T>
struct Less
{
bool operator()(const T& l, const T& r) const
{
return l->_w._count <= r->_w._count;
}
};
template <class T>
struct Greater
{
bool operator()(const T& l, const T& r) const
{
return l->_w._count >= r->_w._count;
}
};
template <class T, class Compare = Less<T>>
//template <class T>
class Heap
{
public:
Heap()
{}
Heap( T* a, size_t size)
{
_a.reserve(size);
for (size_t i = 0; i < size; ++i)
{
_a.push_back(a[i]);
}
for (int i = (_a.size() - 2) / 2; i >= 0;--i)
_AdjustDown(i);
}
size_t Size()
{
return _a.size();
}
void Push(const T& x)
{
_a.push_back(x);
_AdjustUp(_a.size() - 1);
}
void Pop()
{
assert(!_a.empty());
swap(_a[0], _a[_a.size() - 1]);
_a.pop_back();
_AdjustDown(0);
}
const T& Top() const
{
return _a[0];
}
protected:
void _AdjustDown(int root)
{
Compare comFunc;
int parent = root;
size_t child = root * 2 + 1;
while (child < _a.size())
{
if (child + 1 < _a.size() && comFunc(_a[child + 1],_a[child]))
{
++child;
}
if (comFunc(_a[child] , _a[parent]))
{
swap(_a[child], _a[parent]);
parent = child;
child = parent * 2 + 1;
}
else
{
break;
}
}
}
void _AdjustUp(int child)
{
Compare comFunc;
int parent = (child - 1) >> 1;
while (child > 0)
{
if (comFunc(_a[child], _a[parent]))
//if (_a[child] >= _a[parent])
{
swap(_a[child], _a[parent]);
child = parent;
parent = (child - 1) >> 1;
}
else
{
break;
}
}
}
protected:
vector<T> _a;
};
得到huffman编码
void GetHuffmanCode( HuffmanTreeNode<CharInfo>* root)
{
if (root == NULL)
return ;
if (root->_left == NULL && root->_right == NULL)
{
HuffmanTreeNode<CharInfo> *parent, *cur;
cur = root;
parent = cur->_parent;
string& code = _infos[(unsigned)root->_w._ch]._code;
while (parent)
{
if (cur == parent->_left)
code += '0';
else
code += '1';
cur = parent;
parent = cur->_parent;
}
reverse(code.begin(), code.end());
return;
//_infos[root->_w._ch]._code = code;
}
GetHuffmanCode(root->_left);
GetHuffmanCode(root->_right);
}
将huffman编码压缩
压缩的时候就是根据字符的顺序,将其转化为对应的huffman编码,当满8位的时候就将其写入到文件中;
fseek(fout,0, SEEK_SET);
ch = fgetc(fout);
unsigned char value = 0;
int pos = 0;
while (!feof(fout))
{
string& code = _infos[ch]._code;
for (size_t i = 0; i < (code.size()); ++i)
{
value <<= 1;
++pos;
if (code[i] == '1')//将其相应位置置为1
value |= 1;
if (pos == 8)//满8写入
{
printf("%d ", value);
fputc(value, fin);
value = 0;
pos = 0;
}
}
//cout << ch << " ";
ch = fgetc(fout);
}
if (pos)//如果最后的字符huffman编码不够8位将其补够8位写入
{
value <<= (8 - pos);
fputc(value, fin);
}
书写配置信息
这样的话,压缩就算是完成了,但是在压缩的时候我们还需要写配置信息,也就是需要将我们的CharInfo _infos[256]的信息写入,不然的话,当单独解压缩的时候就没有办法了;
我采取的方法是将其信息写入到文件开头处,一开始解压缩就先读配置信息,这样子就需要一个断点来标识配置信息和文件内容了;
因为其本身的结构体中有string类型,如果直接写入会影响压缩效率,而我们只需要只要字符出现的次数就够了;
struct CountInfo
{
int _ch;
LongType _count;
};
CountInfo info;
for (size_t i = 0; i < 256; ++i)
{
if (_infos[i]._count)
{
info._ch = _infos[i]._ch;
info._count = _infos[i]._count;
fwrite(&info,sizeof(info),1,fin);
}
}
//CharInfo info1;
info._count = -1;//断点
fwrite(&info, sizeof(info), 1, fin);
这样子压缩就算是完成了;
解压缩
解压缩的过程就跟压缩类似了,所以简单了一些;就是先根据配置信息读出文件字符信息,然后重新建树,根据文件中现在的数字的位来找字符,再依次写入就好;
读取配置信息
根据写入的配置信息读取到断点处;
FILE* fout = fopen(filename, "rb");
assert(fout);
FILE* fin = fopen(uncompressFile.c_str(), "wb");
assert(fin);
CountInfo info;
while (1)
{
fread(&info, sizeof(CountInfo), 1, fout);
if (info._count == -1)
break;
_infos[info._ch]._ch = info._ch;
_infos[info._ch]._count = info._count;
}
重新建树
根据配置信息重新构建huffman树
HuffmanTree<CharInfo> tree(_infos, 256, invalid);
还原文件
根据压缩文件读取出来的数字按位,依次去huffman树中找到字符写入文件,需要注意一种就是当文件只有一种字符的时候,特别考虑
if (root->_left == NULL && root->_right == NULL)
{
LongType count1 = count;
while (count1)
{
cout << cur->_w._ch<<" ";
fputc(cur->_w._ch, fin);
count1--;
}
}
while (!feof(fout))
{
int pos = 7;
char test = 1;
while (pos >= 0 )
{
if (value &(test << pos))
cur = cur->_right;
else
cur = cur->_left;
--pos;
if (cur->_left == NULL && cur->_right == NULL)
{
//fflush(stdin);
//cout << cur->_w._ch << " ";
fputc(cur->_w._ch, fin);
cur = root;
count--;
}
}
if (count == 0)
break;
value = fgetc(fout);
}
这个样子整个压缩解压就完成了;
整体源码
下面是整体源码:
test.cpp
#include <iostream>
#include <assert.h>
#include <string>
#include <vector>
#include <windows.h>
using namespace std;
#include "HuffmanTree.h"
#include "compress.h"
#include "Heap.h"
int main()
{
Test1();
return 0;
}
compress.h
#pragma once
typedef long long LongType;
struct CharInfo
{
LongType _count;//字符出现的次数
string _code;//huffman编码
unsigned char _ch;//字符
CharInfo(LongType count = 0)
:_count(count)
, _code("")
{}
bool operator!=(const CharInfo& info) const
{
return _count != info._count;
}
CharInfo operator+(const CharInfo& info)
{
return CharInfo(_count + info._count);
}
};
struct CountInfo
{
int _ch;
LongType _count;
};
class CompressFile
{
public:
CompressFile()//对其字符进行初始化
{
for (size_t i = 0; i <= 256; i++)
{
_infos[i]._ch = i;
}
}
void Compress(const char* filename)
{
assert(filename);
FILE* fout = fopen(filename, "rb");
assert(fout);
//char ch = fgetc(fout);
int ch = fgetc(fout);
while (ch != EOF)
{
_infos[(unsigned char)ch]._count++;
ch = fgetc(fout);
}
//fclose(fout);
CharInfo invalid;
invalid._count = 0;
HuffmanTree<CharInfo> tree(_infos, 256,invalid);
GetHuffmanCode(tree.Root());
string compressFile = filename;
compressFile += ".huffman";
FILE* fin = fopen(compressFile.c_str(), "wb");
assert(fin);
CountInfo info;
for (size_t i = 0; i < 256; ++i)
{
if (_infos[i]._count)
{
info._ch = _infos[i]._ch;
info._count = _infos[i]._count;
fwrite(&info,sizeof(info),1,fin);
}
}
//CharInfo info1;
info._count = -1;
fwrite(&info, sizeof(info), 1, fin);
fseek(fout,0, SEEK_SET);
ch = fgetc(fout);
unsigned char value = 0;
int pos = 0;
while (!feof(fout))
{
string& code = _infos[ch]._code;
for (size_t i = 0; i < (code.size()); ++i)
{
value <<= 1;
++pos;
if (code[i] == '1')
value |= 1;
if (pos == 8)
{
printf("%d ", value);
fputc(value, fin);
value = 0;
pos = 0;
}
}
//cout << ch << " ";
ch = fgetc(fout);
}
if (pos)
{
value <<= (8 - pos);
fputc(value, fin);
}
fclose(fin);
fclose(fout);
}
void UnCompress(const char* filename)//当这个文件中只有一种字符时,只有一个根节点
{
assert(filename);
string uncompressFile = filename;
size_t index = uncompressFile.rfind('.');
assert(index != string::npos);
uncompressFile = uncompressFile.substr(0, index);
index = uncompressFile.rfind('.');
assert(index != string::npos);
uncompressFile = uncompressFile.substr(0, index);
uncompressFile += "1.jpg";
FILE* fout = fopen(filename, "rb");
assert(fout);
FILE* fin = fopen(uncompressFile.c_str(), "wb");
assert(fin);
CountInfo info;
while (1)
{
fread(&info, sizeof(CountInfo), 1, fout);
if (info._count == -1)
break;
_infos[info._ch]._ch = info._ch;
_infos[info._ch]._count = info._count;
}
CharInfo invalid;
invalid._count = 0;
HuffmanTree<CharInfo> tree(_infos, 256, invalid);
HuffmanTreeNode<CharInfo>* root = tree.Root();
HuffmanTreeNode<CharInfo>* cur = root;
LongType count = root->_w._count;
unsigned char value = fgetc(fout);
if (root->_left == NULL && root->_right == NULL)
{
LongType count1 = count;
while (count1)
{
cout << cur->_w._ch<<" ";
fputc(cur->_w._ch, fin);
count1--;
}
}
while (!feof(fout))
{
int pos = 7;
char test = 1;
while (pos >= 0 )
{
if (value &(test << pos))
cur = cur->_right;
else
cur = cur->_left;
--pos;
if (cur->_left == NULL && cur->_right == NULL)
{
//fflush(stdin);
//cout << cur->_w._ch << " ";
fputc(cur->_w._ch, fin);
cur = root;
count--;
}
}
if (count == 0)
break;
value = fgetc(fout);
}
fclose(fout);
fclose(fin);
}
protected:
void GetHuffmanCode( HuffmanTreeNode<CharInfo>* root)
{
if (root == NULL)
return ;
if (root->_left == NULL && root->_right == NULL)
{
HuffmanTreeNode<CharInfo> *parent, *cur;
cur = root;
parent = cur->_parent;
string& code = _infos[(unsigned)root->_w._ch]._code;
while (parent)
{
if (cur == parent->_left)
code += '0';
else
code += '1';
cur = parent;
parent = cur->_parent;
}
reverse(code.begin(), code.end());
return;
//_infos[root->_w._ch]._code = code;
}
GetHuffmanCode(root->_left);
GetHuffmanCode(root->_right);
}
protected:
CharInfo _infos[256];
};
void Test1()
{
CompressFile c1;
cout << "压缩中:" << endl;
c1.Compress("psb.jpg");
cout << "----------------------------------------------------------------" << endl;
cout << "解压中" << endl;
//c1.UnCompress("psb.jpg.huffman");
CompressFile c2;
c1.UnCompress("psb.jpg.huffman");
}
HuffmanTree.h
#pragma once
template <class T>
struct HuffmanTreeNode
{
T _w;
HuffmanTreeNode* _left;
HuffmanTreeNode* _right;
HuffmanTreeNode* _parent;
HuffmanTreeNode(const T& x)
:_w(x)
, _left(NULL)
, _right(NULL)
, _parent(NULL)
{}
};
template <class T>
class HuffmanTree
{
typedef HuffmanTreeNode<T> Node;
public:
HuffmanTree()
:_root(NULL)
{
}
HuffmanTree(T* a, size_t n,const T& invalid)
{
Heap<Node*> minHeap;
for (size_t i = 0; i < n; i++)
{
if (a[i] != invalid)
minHeap.Push(new Node(a[i]));
}
while (minHeap.Size() > 1)
{
Node* left = minHeap.Top();
minHeap.Pop();
Node* right = minHeap.Top();
minHeap.Pop();
Node* parent = new Node(left->_w + right->_w);
parent->_left = left;
left->_parent = parent;
parent->_right = right;
right->_parent = parent;
minHeap.Push(parent);
}
_root = minHeap.Top();
}
~HuffmanTree()
{
_Destory(_root);
}
Node* Root()
{
return _root;
}
protected:
void _Destory(Node* root)
{
if (root == NULL)
return;
_Destory(root->_left);
_Destory(root->_right);
delete root;
}
HuffmanTree(const HuffmanTree<T>&);//防拷贝
HuffmanTree& operator=(const HuffmanTree<T>&);
protected:
Node* _root;
};
Heap.h
#pragma once
template <class T>
struct Less
{
bool operator()(const T& l, const T& r) const
{
return l->_w._count <= r->_w._count;
}
};
template <class T>
struct Greater
{
bool operator()(const T& l, const T& r) const
{
return l->_w._count >= r->_w._count;
}
};
template <class T, class Compare = Less<T>>
//template <class T>
class Heap
{
public:
Heap()
{}
Heap( T* a, size_t size)
{
_a.reserve(size);
for (size_t i = 0; i < size; ++i)
{
_a.push_back(a[i]);
}
for (int i = (_a.size() - 2) / 2; i >= 0;--i)
_AdjustDown(i);
}
size_t Size()
{
return _a.size();
}
void Push(const T& x)
{
_a.push_back(x);
_AdjustUp(_a.size() - 1);
}
void Pop()
{
assert(!_a.empty());
swap(_a[0], _a[_a.size() - 1]);
_a.pop_back();
_AdjustDown(0);
}
const T& Top() const
{
return _a[0];
}
protected:
void _AdjustDown(int root)
{
Compare comFunc;
int parent = root;
size_t child = root * 2 + 1;
while (child < _a.size())
{
if (child + 1 < _a.size() && comFunc(_a[child + 1],_a[child]))
{
++child;
}
if (comFunc(_a[child] , _a[parent]))
{
swap(_a[child], _a[parent]);
parent = child;
child = parent * 2 + 1;
}
else
{
break;
}
}
}
void _AdjustUp(int child)
{
Compare comFunc;
int parent = (child - 1) >> 1;
while (child > 0)
{
if (comFunc(_a[child], _a[parent]))
//if (_a[child] >= _a[parent])
{
swap(_a[child], _a[parent]);
child = parent;
parent = (child - 1) >> 1;
}
else
{
break;
}
}
}
protected:
vector<T> _a;
};