利用Trie树对字符串集合进行排序并计算特征值

时间:2022-03-07 22:31:17

该算法用于将一组乱序的字符串反序列化到一个Trie树中,这个过程即可视为对字符串进行了一次排序。

还可以通过调用 GetFeatureString 将该 Trie 树重新序列化。

  1 #include <stdio.h>
  2 #include <stdlib.h>
  3 #include <string.h>
  4 
  5 #ifndef bool
  6 #    define bool char
  7 #endif
  8 
  9 #ifndef true
 10 #    define true 1
 11 #endif
 12 
 13 #ifndef false
 14 #    define false 0
 15 #endif
 16 
 17 #define NEXTSIZE 256
 18 
 19 struct TrieTreeNode
 20 {
 21     struct TrieTreeNode *Next[NEXTSIZE];
 22     bool Accepted;
 23 };
 24 
 25 struct TrieTreeRoot
 26 {
 27     int NodeCount;
 28     struct TrieTreeNode *Tree;
 29 };
 30 
 31 struct TrieTreeRoot *BuildTrieTree();
 32 void InsertItem(struct TrieTreeRoot *TrieTreeRoot, char *Item);
 33 unsigned char *GetFeatureString(struct TrieTreeRoot *TrieTreeRoot, int *StringLength);
 34 
 35 /*
 36  * 构建 Trie 树并初始化
 37  * 返回一个新的 Trie 根节点
 38  */
 39 struct TrieTreeRoot *BuildTrieTree()
 40 {
 41     struct TrieTreeRoot *Root = (struct TrieTreeRoot *)malloc(sizeof(struct TrieTreeRoot));
 42     Root->NodeCount = 1;
 43     Root->Tree = (struct TrieTreeNode *)malloc(sizeof(struct TrieTreeNode));
 44     memset(Root->Tree, '\0', sizeof(struct TrieTreeNode));
 45     return Root;
 46 }
 47 
 48 /*
 49  * 插入新的字符串
 50  * Root : struct TrieTreeRoot* 要操作的 Trie 树根节点
 51  * Item : char* 要插入的字符串
 52  */
 53 void InsertItem(struct TrieTreeRoot *Root, char *Item)
 54 {
 55     struct TrieTreeNode *Ptr = Root->Tree;
 56     int index = 0;
 57     unsigned char Charactor;
 58 
 59     while ((Charactor = Item[index]) != '\0')
 60     {
 61         if (Ptr->Next[Charactor] == NULL)
 62         {
 63             Ptr->Next[Charactor] = (struct TrieTreeNode *)malloc(sizeof(struct TrieTreeNode));
 64             memset(Ptr->Next[Charactor], '\0', sizeof(struct TrieTreeNode));
 65             Root->NodeCount++;
 66         }
 67         Ptr = Ptr->Next[Charactor];
 68         index++;
 69     }
 70 
 71     Ptr->Accepted = true;
 72 }
 73 
 74 /*
 75  * 递归序列化 Trie 树
 76  * Node : struct TrieTreeNode* 当前操作的 Trie 节点
 77  * WritePtr : unsigned char* 特征串写入指针
 78  */
 79 unsigned char *DoFeature(struct TrieTreeNode *Node, unsigned char *WritePtr)
 80 {
 81     int i, count = 0;
 82     unsigned char *ErgodicPtr;
 83 
 84     *WritePtr = (unsigned char)Node->Accepted;    // 写入节点是否接受
 85     WritePtr++;
 86 
 87     ErgodicPtr = WritePtr;                // 记录集合起始地址
 88 
 89     for (i = 0; i < NEXTSIZE; i++)        // 将该组记录写入特征串
 90     {
 91         if (Node->Next[i] != NULL)
 92         {
 93             *WritePtr = (char)i;
 94             WritePtr++;
 95             count++;
 96         }
 97     }
 98 
 99     *WritePtr = '\0';                    // 写入组分隔符
100     WritePtr++;
101 
102     for (i = 0; i < count; i++)            // 递归调用处理所有边
103     {
104         WritePtr = DoFeature(Node->Next[ErgodicPtr[i]], WritePtr);
105     }
106 
107     return WritePtr;
108 }
109 
110 /*
111  * 取得 Trie 的特征串,即序列化 Trie 树
112  * Root : struct TrieTreeRoot* 要操作的 Trie 树根节点
113  * StringLength : int* 长度指针(为了返回二进制串而设置)
114  */
115 unsigned char *GetFeatureString(struct TrieTreeRoot *Root, int *StringLength)
116 {
117     struct TrieTreeNode *Ptr = Root->Tree;
118     // 假设最坏情况下,每个节点只有一条边,那么存储该节点就需要三个单元(Accepted、边、分隔符)
119     // 但实际上真正用到的只有 3N-1 个字节
120     unsigned char *FeatureString = (unsigned char *)malloc(Root->NodeCount * 3);
121     unsigned char *WritePtr = FeatureString;
122 
123     WritePtr = DoFeature(Ptr, WritePtr);
124 
125     *StringLength = WritePtr - FeatureString;
126     return FeatureString;
127 }
128 
129 void Test_1()
130 {
131     struct TrieTreeRoot *t = BuildTrieTree();
132     InsertItem(t, "P(\376P)\377");
133     InsertItem(t, "P(\376)\377");
134     InsertItem(t, "P(\376P)(");
135     InsertItem(t, "P(\376)(");
136     InsertItem(t, "P\376(P))");
137     InsertItem(t, "P\376())");
138     int l = 0, i;
139     unsigned char *s = GetFeatureString(t, &l);
140     printf("Feature: Size=%d, NodeCount=%d\n", l, t->NodeCount);
141     for (i = 0; i < l; i++)
142     {
143         printf("%X ", s[i]);
144     }
145     printf("\n");
146 }
147 
148 void Test_2()
149 {
150     struct TrieTreeRoot *t = BuildTrieTree();
151     InsertItem(t, "P(\376)(");
152     InsertItem(t, "P(\376P)\377");
153     InsertItem(t, "P(\376P)(");
154     InsertItem(t, "P(\376(\377");
155     InsertItem(t, "P(\376P)\377");
156     InsertItem(t, "P\376())");
157     InsertItem(t, "P(\376)\377");
158     InsertItem(t, "P\376(P))");
159     int l = 0, i;
160     unsigned char *s = GetFeatureString(t, &l);
161     printf("Feature: Size=%d, NodeCount=%d\n", l, t->NodeCount);
162     for (i = 0; i < l; i++)
163     {
164         printf("%X ", s[i]);
165     }
166     printf("\n");
167 }
168 
169 int main(int argc, char **argv)
170 {
171     Test_1();
172     Test_2();
173     return 0;
174 }

仍有两个地方可以进行优化:

1、将 next 数组改为指针,有效减少叶子节点占用的空间;

2、如果插入的字符串是固定的,那么可以通过第一遍扫描该组字符串,构建一个大小为256的字典,通过代码 next[dic[charactor]] 进行访问,可有效减少边的数量。