该算法用于将一组乱序的字符串反序列化到一个Trie树中,这个过程即可视为对字符串进行了一次排序。
还可以通过调用 GetFeatureString 将该 Trie 树重新序列化。
1 #include <stdio.h> 2 #include <stdlib.h> 3 #include <string.h> 4 5 #ifndef bool 6 # define bool char 7 #endif 8 9 #ifndef true 10 # define true 1 11 #endif 12 13 #ifndef false 14 # define false 0 15 #endif 16 17 #define NEXTSIZE 256 18 19 struct TrieTreeNode 20 { 21 struct TrieTreeNode *Next[NEXTSIZE]; 22 bool Accepted; 23 }; 24 25 struct TrieTreeRoot 26 { 27 int NodeCount; 28 struct TrieTreeNode *Tree; 29 }; 30 31 struct TrieTreeRoot *BuildTrieTree(); 32 void InsertItem(struct TrieTreeRoot *TrieTreeRoot, char *Item); 33 unsigned char *GetFeatureString(struct TrieTreeRoot *TrieTreeRoot, int *StringLength); 34 35 /* 36 * 构建 Trie 树并初始化 37 * 返回一个新的 Trie 根节点 38 */ 39 struct TrieTreeRoot *BuildTrieTree() 40 { 41 struct TrieTreeRoot *Root = (struct TrieTreeRoot *)malloc(sizeof(struct TrieTreeRoot)); 42 Root->NodeCount = 1; 43 Root->Tree = (struct TrieTreeNode *)malloc(sizeof(struct TrieTreeNode)); 44 memset(Root->Tree, '\0', sizeof(struct TrieTreeNode)); 45 return Root; 46 } 47 48 /* 49 * 插入新的字符串 50 * Root : struct TrieTreeRoot* 要操作的 Trie 树根节点 51 * Item : char* 要插入的字符串 52 */ 53 void InsertItem(struct TrieTreeRoot *Root, char *Item) 54 { 55 struct TrieTreeNode *Ptr = Root->Tree; 56 int index = 0; 57 unsigned char Charactor; 58 59 while ((Charactor = Item[index]) != '\0') 60 { 61 if (Ptr->Next[Charactor] == NULL) 62 { 63 Ptr->Next[Charactor] = (struct TrieTreeNode *)malloc(sizeof(struct TrieTreeNode)); 64 memset(Ptr->Next[Charactor], '\0', sizeof(struct TrieTreeNode)); 65 Root->NodeCount++; 66 } 67 Ptr = Ptr->Next[Charactor]; 68 index++; 69 } 70 71 Ptr->Accepted = true; 72 } 73 74 /* 75 * 递归序列化 Trie 树 76 * Node : struct TrieTreeNode* 当前操作的 Trie 节点 77 * WritePtr : unsigned char* 特征串写入指针 78 */ 79 unsigned char *DoFeature(struct TrieTreeNode *Node, unsigned char *WritePtr) 80 { 81 int i, count = 0; 82 unsigned char *ErgodicPtr; 83 84 *WritePtr = (unsigned char)Node->Accepted; // 写入节点是否接受 85 WritePtr++; 86 87 ErgodicPtr = WritePtr; // 记录集合起始地址 88 89 for (i = 0; i < NEXTSIZE; i++) // 将该组记录写入特征串 90 { 91 if (Node->Next[i] != NULL) 92 { 93 *WritePtr = (char)i; 94 WritePtr++; 95 count++; 96 } 97 } 98 99 *WritePtr = '\0'; // 写入组分隔符 100 WritePtr++; 101 102 for (i = 0; i < count; i++) // 递归调用处理所有边 103 { 104 WritePtr = DoFeature(Node->Next[ErgodicPtr[i]], WritePtr); 105 } 106 107 return WritePtr; 108 } 109 110 /* 111 * 取得 Trie 的特征串,即序列化 Trie 树 112 * Root : struct TrieTreeRoot* 要操作的 Trie 树根节点 113 * StringLength : int* 长度指针(为了返回二进制串而设置) 114 */ 115 unsigned char *GetFeatureString(struct TrieTreeRoot *Root, int *StringLength) 116 { 117 struct TrieTreeNode *Ptr = Root->Tree; 118 // 假设最坏情况下,每个节点只有一条边,那么存储该节点就需要三个单元(Accepted、边、分隔符) 119 // 但实际上真正用到的只有 3N-1 个字节 120 unsigned char *FeatureString = (unsigned char *)malloc(Root->NodeCount * 3); 121 unsigned char *WritePtr = FeatureString; 122 123 WritePtr = DoFeature(Ptr, WritePtr); 124 125 *StringLength = WritePtr - FeatureString; 126 return FeatureString; 127 } 128 129 void Test_1() 130 { 131 struct TrieTreeRoot *t = BuildTrieTree(); 132 InsertItem(t, "P(\376P)\377"); 133 InsertItem(t, "P(\376)\377"); 134 InsertItem(t, "P(\376P)("); 135 InsertItem(t, "P(\376)("); 136 InsertItem(t, "P\376(P))"); 137 InsertItem(t, "P\376())"); 138 int l = 0, i; 139 unsigned char *s = GetFeatureString(t, &l); 140 printf("Feature: Size=%d, NodeCount=%d\n", l, t->NodeCount); 141 for (i = 0; i < l; i++) 142 { 143 printf("%X ", s[i]); 144 } 145 printf("\n"); 146 } 147 148 void Test_2() 149 { 150 struct TrieTreeRoot *t = BuildTrieTree(); 151 InsertItem(t, "P(\376)("); 152 InsertItem(t, "P(\376P)\377"); 153 InsertItem(t, "P(\376P)("); 154 InsertItem(t, "P(\376(\377"); 155 InsertItem(t, "P(\376P)\377"); 156 InsertItem(t, "P\376())"); 157 InsertItem(t, "P(\376)\377"); 158 InsertItem(t, "P\376(P))"); 159 int l = 0, i; 160 unsigned char *s = GetFeatureString(t, &l); 161 printf("Feature: Size=%d, NodeCount=%d\n", l, t->NodeCount); 162 for (i = 0; i < l; i++) 163 { 164 printf("%X ", s[i]); 165 } 166 printf("\n"); 167 } 168 169 int main(int argc, char **argv) 170 { 171 Test_1(); 172 Test_2(); 173 return 0; 174 }
仍有两个地方可以进行优化:
1、将 next 数组改为指针,有效减少叶子节点占用的空间;
2、如果插入的字符串是固定的,那么可以通过第一遍扫描该组字符串,构建一个大小为256的字典,通过代码 next[dic[charactor]] 进行访问,可有效减少边的数量。