利用Trie树对字符串集合进行排序并计算特征值

时间:2023-03-08 15:36:30
利用Trie树对字符串集合进行排序并计算特征值

该算法用于将一组乱序的字符串反序列化到一个Trie树中,这个过程即可视为对字符串进行了一次排序。

还可以通过调用 GetFeatureString 将该 Trie 树重新序列化。

 #include <stdio.h>
#include <stdlib.h>
#include <string.h> #ifndef bool
# define bool char
#endif #ifndef true
# define true
#endif #ifndef false
# define false
#endif #define NEXTSIZE 256 struct TrieTreeNode
{
struct TrieTreeNode *Next[NEXTSIZE];
bool Accepted;
}; struct TrieTreeRoot
{
int NodeCount;
struct TrieTreeNode *Tree;
}; struct TrieTreeRoot *BuildTrieTree();
void InsertItem(struct TrieTreeRoot *TrieTreeRoot, char *Item);
unsigned char *GetFeatureString(struct TrieTreeRoot *TrieTreeRoot, int *StringLength); /*
* 构建 Trie 树并初始化
* 返回一个新的 Trie 根节点
*/
struct TrieTreeRoot *BuildTrieTree()
{
struct TrieTreeRoot *Root = (struct TrieTreeRoot *)malloc(sizeof(struct TrieTreeRoot));
Root->NodeCount = ;
Root->Tree = (struct TrieTreeNode *)malloc(sizeof(struct TrieTreeNode));
memset(Root->Tree, '\0', sizeof(struct TrieTreeNode));
return Root;
} /*
* 插入新的字符串
* Root : struct TrieTreeRoot* 要操作的 Trie 树根节点
* Item : char* 要插入的字符串
*/
void InsertItem(struct TrieTreeRoot *Root, char *Item)
{
struct TrieTreeNode *Ptr = Root->Tree;
int index = ;
unsigned char Charactor; while ((Charactor = Item[index]) != '\0')
{
if (Ptr->Next[Charactor] == NULL)
{
Ptr->Next[Charactor] = (struct TrieTreeNode *)malloc(sizeof(struct TrieTreeNode));
memset(Ptr->Next[Charactor], '\0', sizeof(struct TrieTreeNode));
Root->NodeCount++;
}
Ptr = Ptr->Next[Charactor];
index++;
} Ptr->Accepted = true;
} /*
* 递归序列化 Trie 树
* Node : struct TrieTreeNode* 当前操作的 Trie 节点
* WritePtr : unsigned char* 特征串写入指针
*/
unsigned char *DoFeature(struct TrieTreeNode *Node, unsigned char *WritePtr)
{
int i, count = ;
unsigned char *ErgodicPtr; *WritePtr = (unsigned char)Node->Accepted; // 写入节点是否接受
WritePtr++; ErgodicPtr = WritePtr; // 记录集合起始地址 for (i = ; i < NEXTSIZE; i++) // 将该组记录写入特征串
{
if (Node->Next[i] != NULL)
{
*WritePtr = (char)i;
WritePtr++;
count++;
}
} *WritePtr = '\0'; // 写入组分隔符
WritePtr++; for (i = ; i < count; i++) // 递归调用处理所有边
{
WritePtr = DoFeature(Node->Next[ErgodicPtr[i]], WritePtr);
} return WritePtr;
} /*
* 取得 Trie 的特征串,即序列化 Trie 树
* Root : struct TrieTreeRoot* 要操作的 Trie 树根节点
* StringLength : int* 长度指针(为了返回二进制串而设置)
*/
unsigned char *GetFeatureString(struct TrieTreeRoot *Root, int *StringLength)
{
struct TrieTreeNode *Ptr = Root->Tree;
// 假设最坏情况下,每个节点只有一条边,那么存储该节点就需要三个单元(Accepted、边、分隔符)
// 但实际上真正用到的只有 3N-1 个字节
unsigned char *FeatureString = (unsigned char *)malloc(Root->NodeCount * );
unsigned char *WritePtr = FeatureString; WritePtr = DoFeature(Ptr, WritePtr); *StringLength = WritePtr - FeatureString;
return FeatureString;
} void Test_1()
{
struct TrieTreeRoot *t = BuildTrieTree();
InsertItem(t, "P(\376P)\377");
InsertItem(t, "P(\376)\377");
InsertItem(t, "P(\376P)(");
InsertItem(t, "P(\376)(");
InsertItem(t, "P\376(P))");
InsertItem(t, "P\376())");
int l = , i;
unsigned char *s = GetFeatureString(t, &l);
printf("Feature: Size=%d, NodeCount=%d\n", l, t->NodeCount);
for (i = ; i < l; i++)
{
printf("%X ", s[i]);
}
printf("\n");
} void Test_2()
{
struct TrieTreeRoot *t = BuildTrieTree();
InsertItem(t, "P(\376)(");
InsertItem(t, "P(\376P)\377");
InsertItem(t, "P(\376P)(");
InsertItem(t, "P(\376(\377");
InsertItem(t, "P(\376P)\377");
InsertItem(t, "P\376())");
InsertItem(t, "P(\376)\377");
InsertItem(t, "P\376(P))");
int l = , i;
unsigned char *s = GetFeatureString(t, &l);
printf("Feature: Size=%d, NodeCount=%d\n", l, t->NodeCount);
for (i = ; i < l; i++)
{
printf("%X ", s[i]);
}
printf("\n");
} int main(int argc, char **argv)
{
Test_1();
Test_2();
return ;
}

仍有两个地方可以进行优化:

1、将 next 数组改为指针,有效减少叶子节点占用的空间;

2、如果插入的字符串是固定的,那么可以通过第一遍扫描该组字符串,构建一个大小为256的字典,通过代码 next[dic[charactor]] 进行访问,可有效减少边的数量。