trie树-《算法导论》学习笔记十四

时间:2021-04-22 06:55:16

引用一下百度百科的话吧:
Trie树,又称单词查找树,是一种树形结构,是一种哈希树的变种。典型应用是用于统计,排序和保存大量的字符串(但不仅限于字符串),所以经常被搜索引擎系统用于文本词频统计。它的优点是:利用字符串的公共前缀来减少查询时间,最大限度地减少无谓的字符串比较,查询效率比哈希树高。

这里构建了一棵字典树,每个结点有52个孩子指针,对应26个小写字母和26个大写字母,根节点不存储数据,一个单词从第一个字母开始经由根结点走对应分支进行插入和统计。

trie树结点卫星数据包含了字母、出现次数、是否构成一个单词,孩子指针就是一个52大小的trie树结点指针数组。

实现了几个操作:

1. 插入单词

遍历每个字母,从根结点出发,如果结点对应字母的孩子结点为空,就创建结点,出现次数为1,如果存在这个结点,出现次数就+1,并且如果单词结束,结束处的结点是否构成一个单词字段标识为构成

2. 遍历树,并打印所有单词和每个单词出现次数

3. 统计树,按给定的数字统计出现次数前几的单词

树统计,与遍历类似,用尾递归,并传入一个大于单词最大长度的数组来存储每个分支的单词,如果遇到结点能构成一个单词,就判断你单词个数,并以插入排序的方式插入创建的统计链表(类似打扑克的插排序);
统计链表有更新操作,根据输入的统计前几的数字来维护这个链表该去掉哪些结点,该更新哪些结点的顺序等


获取单词来源为编写的一个简单单词随机生成代码,写入一个文件中,可指定单词最大长度,全大写/全小写/大小写均有,单词个数,单词范围(只支持a-*或A-*,例如5,就是生成a-e/A-E的单词)
贴代码:

随机生成单词

#include <stdio.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <time.h>
#include <unistd.h>
#include <errno.h>
#include <string.h>
#include <stdlib.h>

int word_len = 0;
int upper_low = 65;
int lowwer_low = 97;

// if letter_size = 5
// it will generate a-e or A-E letter.
int letter_size = 0;

int random_letter()
{
    return rand() % letter_size;
}
int random_word(
    char *word,
    int opt )
{   
    // minimum word'length is 3.
    int true_word_len = rand() % word_len + 3;
    int true_word_len1 = true_word_len;

    while ( true_word_len-- ) {
        char letter = 0;

        if ( opt == 0 ) {
            letter = random_letter() + lowwer_low;
        } else if ( opt == 1 ) {
            letter = random_letter() + upper_low;
        } else {
            int opt_case = rand() % 2;
            if ( opt_case == 0 )
                letter = random_letter() + lowwer_low;
            else 
                letter = random_letter() + upper_low;
        }
        word[true_word_len] = letter;
    }

    return true_word_len1;
}
void gen_word(
    int fd,
    int word_num,
    int opt )
{
    char word[20] = {0};
    int true_word_len = 0;

    while ( word_num-- ) {
        memset( word, 0, 20);
        true_word_len = random_word( word, opt );
        word[true_word_len] = '\n';
        write( fd, word, true_word_len + 1 );
    }
}
int main(
    int argc,
    char **argv )
{
    srand((int)time(NULL));
    if ( argc != 5 ) {
        printf("please input "
                "word's length & "
                "words' number & "
                "word's range & "
                "gen_case(0:lowwer case,1:upper case,other:both\n");
        exit( 0 );
    }

    word_len = atoi( argv[1] );
    int word_num = atoi( argv[2] );
    letter_size = atoi( argv[3] );
    int opt = atoi( argv[4] );

    int fd = open("word.txt", O_RDWR | O_TRUNC, 0777);

    gen_word( fd, word_num, opt );

    close( fd );

    return 0;
}

trie树

#include <stdio.h>
#include <unistd.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <errno.h>

#define MAX_CHILD_NUM 52
#define UPPER_LOW 65
#define UPPER_UP 90
#define LOWER_LOW 97
#define LOWER_UP 122

#define PRINT(format, arg...) \
do { \
    printf("[%s/%d]:", __func__, __LINE__); \
    printf(format, ##arg); \
    printf("\n"); \
}while(0) 

typedef struct trieTreeNode {
    char letter;
    int count;
    int is_word;
    struct trieTreeNode *next[MAX_CHILD_NUM];
} trieTreeNode;

typedef struct trieTree {
    trieTreeNode *root;
} trieTree;

typedef struct count_data {
    int order;
    int count;
    char string[20];
    struct count_data *next;
} count_data;

int trans_letter_2_index(
    char letter )
{
    int index = -1;
    if ( letter >= LOWER_LOW 
        && letter <= LOWER_UP ) {
        index = letter - LOWER_LOW + 26;
    } else if ( letter >= UPPER_LOW 
        && letter <= UPPER_UP ) {
        index = letter - UPPER_LOW; 
    } else {
        PRINT("error letter input:%c", letter);
        exit( 0 );
    }

    return index;


}
trieTreeNode *create_node(
    char letter )
{
    trieTreeNode *node = 
        ( trieTreeNode * )calloc( 1, sizeof(trieTreeNode) );
    node->letter = letter;
    node->count = 0;
    node->is_word = 0;
}
void insert(
    trieTreeNode *root,
    char *word )
{
    if ( root == NULL ) {
        PRINT("root node is null.");
        return;
    }
    int i = 0;
    trieTreeNode *cur = root;

    for ( i; word[i] != '\0'; i++ ) {
        int next_index = trans_letter_2_index(word[i]);
        //PRINT("letter:%c, index:%d", word[i], next_index);
        if ( cur->next[next_index] == NULL ) {
            cur->next[next_index] = create_node( word[i] );
        } else {
            //cur->next[next_index]->count += 1;
        }
        if ( word[i+1] == '\0' ) {
            cur->next[next_index]->count += 1;
            cur->next[next_index]->is_word = 1;
        }
        cur = cur->next[next_index];
    }
}
// 删除链表所有结点
void delete_list_all_node(
    count_data *node )
{
    count_data *p = NULL;
    while ( node ) {
        p = node;
        node = node->next;
        free( p );
    }
}
void print_list_all_node(
    count_data *node )
{
    printf("\n");
    node = node->next;
    while ( node ) {
        printf("[%d],count:%d\tword:%s\n", 
            node->order, node->count, node->string);
        node = node->next;
    }
    printf("\n");
}
void update_insert_node(
    count_data *insert_node )
{
    if ( !insert_node->next )
        return;
    count_data *print_p = insert_node; 

    if ( insert_node->order == 1 ) {
        delete_list_all_node( insert_node->next );
        insert_node->next = NULL;
    } else if ( insert_node->order < 1 ) {
        PRINT("ERROR!!!!!");
        exit( 0 );
    } else {
        count_data *p = insert_node;
        insert_node = insert_node->next;
        while ( insert_node ) {
            if ( insert_node->count < p->count ) {
                insert_node->order = p->order - 1;
            } else if ( insert_node->count > p->count ) {
                PRINT("ERROR!!!cur->count:%d, pre->count:%d", 
                    insert_node->count, p->count);
                exit( 0 );
            } else {
                insert_node->order = p->order;
            }
            if ( insert_node->order < 1 ) {
                delete_list_all_node( insert_node );
                p->next = NULL;
                break;
            }
            p = insert_node;
            insert_node = insert_node->next;
        }
    }
}
void list_insert( 
    char *tmp_word,
    int cur_count,
    int tail,
    count_data *head,
    int top_num )
{
    tmp_word[tail] = '\0';
    count_data *new_data = ( count_data * )malloc( sizeof(count_data) );
    new_data->count = cur_count;
    memcpy( new_data->string, tmp_word, tail + 1 );
    new_data->next = NULL;

    //PRINT("count:%d\ttmp_word:%s, string:%s", cur_count, tmp_word, new_data->string);

    if ( head->next == NULL ) {
        head->next = new_data;
        new_data->order = top_num;
    } else if ( cur_count > head->next->count ) {
        new_data->order = head->next->order;
        new_data->next = head->next;
        head->next = new_data;
        update_insert_node( new_data );
    } else {
        while ( 1 ) {
            head = head->next;
            if ( head->next == NULL ) {
                if ( head->order > 1 ) {
                    head->next = new_data;
                    if ( head->count == new_data->count ) 
                        new_data->order = head->order;
                    else
                        new_data->order = head->order - 1;

                    head->next = new_data;
                } else if ( head->count > new_data->count ) {
                    // 不插入
                    free( new_data );
                } else if ( head->count == new_data->count ) {
                    head->next = new_data;
                    new_data->order = head->order;
                } else if ( head->count < new_data->count ) {
                    // 此种情况只有求出现次数最多的前1个单词时有
                    head->count = new_data->count;
                    free( new_data );
                }
                break;
            } else if ( head->count >= cur_count 
                && head->next->count < cur_count ) {
                new_data->next = head->next;
                head->next = new_data;
                new_data->order = head->order;
                update_insert_node( new_data );
                break;
            }
        }
    }
}
void find_top_count1(
    trieTreeNode *root,
    char *tmp_word,
    int tail,
    count_data *head, 
    int top_num )
{
    if ( !root )
        return;

    tmp_word[tail] = root->letter;
    tail++;

    if ( root->is_word ) {

        /*
        printf("\n--------------before delete------------------\n");
        print_list_all_node( head );
        printf("\n--------------------------------------------\n");
        */

        list_insert( tmp_word, root->count, tail, head, top_num );

        /*
        printf("\n--------------------after delete----------------------------\n");
        print_list_all_node( head );
        printf("\n-----------------------------------------------------------\n");
        */
    }

    int i = 0;
    for ( i; i < MAX_CHILD_NUM; i++ ) {
        find_top_count1( root->next[i], tmp_word, tail, head, top_num );
    }
}
void find_top_count(
    trieTreeNode *root,
    int top_num )
{
    if ( !root )
        return;

    int i = 0;

    count_data *head = ( count_data * )malloc( sizeof(count_data) );

    for ( i; i < MAX_CHILD_NUM; i++ ) {
        char tmp_word[20] = {0};
        find_top_count1( root->next[i], tmp_word, 0, head, top_num );
    }

    printf("出现次数最大前%d次的单词:\n", top_num);
    count_data *p = head->next;
    count_data *free_p = NULL;
    while ( p != NULL ) {
        free_p = p;
        printf("前%d,count:%d\t%s\n", p->order, p->count, p->string);
        p = p->next;
        free( free_p );
    }
    free( head );
}

void tree_walk1(
    trieTreeNode *root,
    char *tmp_word,
    int tail )
{
    if ( !root )
        return;

    tmp_word[tail] = root->letter;
    tail++;
    //printf("%c\n", root->letter);
    if ( root->is_word ) {
        int j = 0;
        printf("count:%d\t", root->count);
        for ( j; j < tail; j++ ) {
            printf("%c", tmp_word[j]);
        }
        printf("\n");
    }

    int i = 0;
    for ( i; i < MAX_CHILD_NUM; i++ ) {
        tree_walk1( root->next[i], tmp_word, tail );
    }
}
void tree_walk(
    trieTreeNode *root )
{
    if ( !root ) 
        return;

    int i = 0;

    for ( i; i < MAX_CHILD_NUM; i++ ) {
        char tmp_word[20] = {0};
        tree_walk1( root->next[i], tmp_word, 0 );
    }
}
int main(
    int argc,
    char **argv )
{
    if ( argc != 3 ) {
        PRINT("USAGE: please input words file & top number");
        exit( 0 );
    }

    char *file_name = argv[1];
    int top_num = atoi( argv[2] );

    trieTree *tree = ( trieTree * )malloc( sizeof(trieTree) );

    tree->root = create_node( -1 );

    int fd = open(file_name, O_RDONLY);
    if ( fd < 0 ) {
        PRINT("OPEN FILE %s ERROR!!!(%s)", file_name, (char *)strerror(errno));
        exit( 0 );
    }
    // 每次读取文件的缓冲区
    char buf[1024 * 10] = {0};

    // 每次读取的大小
    int read_len = 1024;

    // 读取的返回值
    int read_bytes = 0;

    // 从读取的缓冲区每次提取'\n' - '\n'之间的单词
    char tmp_word[20] = {0};

    // 读取文件缓冲区如果出现单词隔断,剩余部分在下一次
    // read才能读到,这个index做单词继续拼接
    int tmp_index = 0;

    while ( 1 ) {
        memset( buf, 0, read_len );
        read_bytes = read( fd, buf, read_len );
        if ( read_bytes <= 0 )
            break;
        //printf("readbytes:%d------\n%s\n", read_bytes, buf);
        int cur = 0;
        while ( cur < read_bytes ) {
            // 单词文件最后一个单词末尾一定要有'\n'
            if ( buf[cur] == '\n' ) {
                tmp_word[tmp_index] = '\0';
                //printf("insert word:%s\n", tmp_word);
                insert( tree->root, tmp_word );
                memset( tmp_word, 0, 20 );
                tmp_index = 0;
            } else {
                tmp_word[tmp_index] = buf[cur];
                tmp_index++;
            }
            cur++;
        }
    }
    printf("\n========================================\n");
    tree_walk( tree->root );

    find_top_count( tree->root, top_num );

    close( fd );

    return 0;
}

trie树的代码使用:./xxx word.txt 10即统计出现次数前10的单词,并打印单词和次数

例如对生成了10000个单词的word.txt文件,统计前5:
./xxx word.txt 5
trie树-《算法导论》学习笔记十四