哈夫曼编码在 UE4 中的实现

导语
在UE4 中实现哈夫曼编码的本意是在网络数据传输的时候，进行数据包一定程度的数据压缩。但是实现完之后发现并无用武之地，一个是因为压缩的数据中还必须要包含编码表，对于高频、小数据包反而会增加数据的大小和增加编码解码的计算消耗。二是因为好多要传输的数据本身就是已经编码压缩过了，再次编码压缩已经没有效果了（比如UE4 的 RenderTarget 图像数据和 VoiceData）。所以权当是一次代码练习吧。

关于哈夫曼编码

Huffman于1952年提出一种编码方法，该方法完全依据出现概率来构造异字头的平均长度最短的码字。简单来说就是用最少的bit来表示出现频率最高的字符，生成一张编码对照表，解码的时候根据对照表依次还原就行。
比如 “aabccade”，其中总共有a、b、c、d、e 五个字符，a出现3次，c出现2次，b、d、e出现1次，
那么可以用0表示a，用1表示c，用10表示b，用11表示d，用100表示e。所以原字符就可以用

1	0-0-10-1-1-0-11-100

来表示了。当然具体实现细节要更复杂一点，涉及到二叉树，前缀码等。

好了，话不多说，上代码！

HuffmanEncode.h

#pragma once
#include "CoreMinimal.h"
#include <queue>
#include <string>

// 设置1
#define SETBIT(souce, index) souce |= (1 << index)
// 设置0
#define CLEARBIT(souce, index) souce &= ~(1 << index)
// 位取反
#define REVERSEBIT(souce, index) souce ^= (1 << index)
// 获取位
#define GETBIT(souce, index) ((souce) >> (index) & 1)

using namespace std;
DECLARE_LOG_CATEGORY_EXTERN(HuffmanEncode, Log, All);

namespace Huffman
{

	// 定义一颗 哈夫曼 树节点
	class Node {
	public:
		unsigned char ch = '\0'; // 该节点中的字符
		int freq = 0; // 该节点字符出现的次数（频率）
		Node* left = nullptr; // 左叶子节点
		Node* right = nullptr; // 右叶子节点

		Node(unsigned char _c, int f, Node* l = nullptr, Node* r = nullptr)
			:ch(_c), freq(f), left(l), right(r)
		{}

		~Node()
		{
			// cout << "~ Node " << this->ch << endl;
		}

		// 重载 < 运算符，在加入优先队列的时候决定如何处理结点位置
		bool operator<(const Node& node) const
		{ 
			return freq > node.freq;
		}

		bool isLeaf()
		{
			return left == nullptr && right == nullptr;
		}
	};


	/**
	* 从输入流中读字节流，并将编码后的结果写入输出流
	*/
	void Encode(unsigned char* inData, const int dataNum, unsigned char* outData, int& outDataNum);

	/**
	 * 解码
	 * 读取编码的比特流，
	 * 将比特流对应为路径在单词查找树上找，将找到的结点中的字符写出
	 */
	void Decode(unsigned char* indata, int dataSize, unsigned char* outData, int& outDataSize);


	// ============= 辅助函数 ============ 

	/**
	 * 删除
	 */
	void delTree(Node* root);

	/**
	 * 读比特流，得出一颗单词查找树
	 */
	Node* readTree(unsigned char* indata, int& nodeIndex);

	/**
	 * 将单词查找树编码成比特输出串并写入到输出流
	 */
	void writeTree(Node* x, int& count, unsigned char* tree);

	/**
	 * 构建编译表，每个char值与一个比特字符串（即Huffman树上路径）的对照表
	 */
	void buildCode(string st[], Node* x, string s); // st[63] = "001"

	/**
	 * @param freq 字符出现的次数
	 */
	Node* buildTree(int freq[], int& outLeafNum);
}

HuffmanEncode.cpp

// Fill out your copyright notice in the Description page of Project Settings.

#include "HuffmanEncode.h"
#include "Engine.h"



using namespace std;
DEFINE_LOG_CATEGORY(HuffmanEncode);

namespace Huffman
{

	void Encode(unsigned char* inData, const int dataNum, unsigned char* outData, int& outDataNum)
	{

		//1. 计算【ascii 0-255】每个字符出现的次数，数组的index就是字符的值，数组的值是字符出现的次数
		int freq[256] = { 0 };
		for (int i = 0; i < dataNum; i++) {
			freq[inData[i]]++;
		}

		//2. 构建Huffman树
		int leafNum = 0; // 叶子节点数量
		Node* root = buildTree(freq, leafNum);

		//3. 构建编译表，每个char值与一个比特字符串（即Huffman树上路径）的对照表
		string st[256] = { "" };
		buildCode(st, root, "");


		//4. 将单词查找树编码成比特输出串并写入到buff
		short int treeByteNum = leafNum * 3 - 1;
		unsigned char* tree = &outData[1];
		int currentByteIndex = 0; // 当前写到第几个字节
		writeTree(root, currentByteIndex, tree);

		// 用前面 1 个字节存储最后个1字节写到第几个bit,从0开始算起
		// outData[0] = xxx;


		//5. 将单词总数编码成比特输出串并写入到输出流
		int outBitCount = 0; // 写当前字节的第几位
		unsigned char* outBitBuff = &outData[treeByteNum + 1];
		int outBuffCount = 0; // 当前在写第几个字节

		for (int i = 0; i < dataNum; i++) {
			string code = st[inData[i]];   //code表示Huffman单词查找数上的路径
			for (int j = 0; j < code.length(); j++) {  //要一位一位地输出
				if (code[j] == '1')
				{
					SETBIT(outBitBuff[outBuffCount], outBitCount);
					outBitCount++;
					if (outBitCount == 8) { outBitCount = 0; outBuffCount++; }
				}
				else
				{
					CLEARBIT(outBitBuff[outBuffCount], outBitCount);
					outBitCount++;
					if (outBitCount == 8) { outBitCount = 0; outBuffCount++; }
				}
			}
		}

		// 已经压缩的字节总数
		int inDataEncodeByte = outBuffCount + (outBitCount > 0 ? 1 : 0);

		// 用前面 1 个字节存储最后个1字节写到第几个bit,从0开始算起
		outData[0] = outBitCount;

		outDataNum = 1 + treeByteNum + inDataEncodeByte;
		UE_LOG(HuffmanEncode, Warning, TEXT("======= Encode finished ========="));
		delTree(root);  // 清空内存
	}

	void Decode(unsigned char* indata, int dataSize, unsigned char* outData, int& outDataSize)
	{

		// 1，获取最后一个字节需要读到第几个bit
		int lastByteBitCount = indata[0];


		// 2，读出查找树
		int index = -1;
		unsigned char* treeData = &indata[1];
		Node* root = readTree(treeData, index);


		// 3，
		unsigned char* encodedData = &indata[index + 2];
		int N = dataSize - index - 2;  //读出存在压缩文件中的字符串长度

		outDataSize = 0;
		Node* x = root;
		for (int i = 0; i < N; i++) {   //找出源文件中每个字符
			for (int j = 0; j < 8; j++)
			{
				if (i == N - 1 && j == lastByteBitCount + 1) // 
				{
					break;
				}
				if (!x->isLeaf()) //遍历，直到叶子结点
				{
					if (GETBIT(encodedData[i], j)) {
						x = x->right;
					}
					else {
						x = x->left;
					}
				}
				else
				{
					// cout << " decode is " << x->ch << endl;
					outData[outDataSize] = x->ch;
					outDataSize++;
					j--;
					x = root; // 重新从根节点出发
				}
			}
		}

		UE_LOG(HuffmanEncode, Warning, TEXT("======= Decode finished ========="));
		delTree(root); // 清空内存
	}

	void delTree(Node* root)
	{
		if (!root->isLeaf()) //读到1，说明是叶子结点
		{
			delTree(root->left);
			delTree(root->right);
		}
		delete root;
	}

	Node* readTree(unsigned char* indata, int& nodeIndex)
	{
		nodeIndex++;

		if (indata[nodeIndex] == 1) //读到1，说明是叶子结点
		{
			nodeIndex++;
			return new Node(indata[nodeIndex], 0, nullptr, nullptr);
		}
		else//读到的是0，说明是中间结点，需要递归直到读到1为止
		{
			Node* left = readTree(indata, nodeIndex);
			Node* right = readTree(indata, nodeIndex);
			return new Node('\0', 0, left, right);
		}
	}

	void writeTree(Node* x, int& count, unsigned char* tree)
	{
		if (x->isLeaf()) {
			tree[count] = 1; count++;
			tree[count] = x->ch;
			count++;
			return;
		}
		tree[count] = 0; count++;
		writeTree(x->left, count, tree);
		writeTree(x->right, count, tree);
	}

	void buildCode(string st[], Node* x, string s)
	{
		if (!x->isLeaf()) {
			buildCode(st, x->left, s + "0");
			buildCode(st, x->right, s + "1");
		}
		else {
			st[x->ch] = s;
		}
	}

	Node* buildTree(int freq[], int& outLeafNum)
	{

		// 最小列队
		priority_queue<Node> pq;

		//初始化多个将构成一颗Huffman树的结点
		for (int i = 0; i < 256; i++) {
			if (freq[i] > 0) pq.push(Node(i, freq[i], nullptr, nullptr));
		}

		outLeafNum = pq.size();

		// special case in case there is only one character with a nonzero frequency
		if (pq.size() == 1) {
			if (freq['\0'] == 0) pq.push(Node('\0', 0, nullptr, nullptr));
			else pq.push(Node('\1', 0, nullptr, nullptr));
		}

		//归并两个小树
		while (pq.size() > 1) {
			Node* left = new Node(pq.top()); pq.pop();
			Node* right = new Node(pq.top()); pq.pop();

			Node* parent = new Node('\0', left->freq + right->freq, left, right);//创建连接子树的中间结点
			pq.push(*parent);
		}

		Node* root = new Node(pq.top()); pq.pop();
		return root;
	}
}

蓝图代码：

TArray<uint8> UBPFLibrary::HFM_Encode(TArray<uint8> indata)
{
    uint8* outdata = new uint8[indata.Num()* 3]{ 0 };
    int outdatasize =0;
    Huffman::Encode(indata.GetData),indata.Numo,outdata, outdatasize);
    TArray<uint8> outdatal;
    outdata1.Init(o,outdatasize);
    FMemory::Memcpy(outdata1.GetData(),outdata,outdatasize);
    return outdatal;
}

TArray<uint8>UBPFLibrary::HFM_Decode(TArray<uint8> indata)
{
    uint8* outdata = new uint8[indata.Num)* 3]{ e };
    int outdatasize = 0;
    Huffman::Decode(indata.GetData(),indata.Num(),outdata, outdatasize);
    TArray<uint8>outdatal;
    outdata1.Init(e,outdatasize);
    FMemory::Memcpy(outdata1.GetData(),outdata,outdatasize);
    return outdatal;
}