AC自动机1——适用于utf-8编码的Trie树

最近需要用到文本的拼音相似度计算，看了hankcs大神的hanlp里面通过ac自动机实现拼音的存储，想把它转成python版本的。开始啃AC自动机吧。

AC自动机建立在Trie树和KMP字符串匹配算法的基础之上。首先啃Trie树。

关于Trie树的概念，这一篇讲得很好，还附带介绍了后缀树： http://blog.csdn.net/v_july_v/article/details/6897097

我所要做的是把UTF-8编码的中文词和拼音对应起来。UTF-8编码将一个常用汉字编码成3个byte，每个byte的取值范围是0x00～0xFF（共256种取值）。鉴于这种情况，需要构造一个256叉的Trie，即每个节点最多可能有256个子节点。

看了几个程序后，集众人智慧，写了一个自己的。

# coding:utf-8

import sys

# Python 2-only workaround: sys.setdefaultencoding is removed from the sys
# module during interpreter start-up, so sys is reloaded to get it back.
reload(sys)
# Switch the implicit str <-> unicode conversion codec from ASCII to UTF-8.
# NOTE(review): Trie256.getUtf8String calls .encode("utf-8") on whatever it
# is given, including byte strings read from the dictionary file; under
# Python 2 that appears to depend on this setting -- confirm before removing.
sys.setdefaultencoding("utf-8")

class TrieNode(object):
    """One node of the byte-level trie.

    Attributes:
        one_byte: dict mapping a byte value to the child TrieNode reached
            by that byte.
        value: payload stored for the key ending at this node, or None.
        is_word: True when a complete key ends at this node.
    """

    def __init__(self):
        self.one_byte = {}
        self.value = None
        self.is_word = False


class Trie256(object):
    """Trie keyed on the individual bytes of UTF-8 encoded strings.

    Every node can have one child per distinct byte value. Keys are passed
    to insert/find/modify/delete as byte sequences such as the bytearray
    returned by getUtf8String().

    The code is written so that it runs unchanged on Python 2 and Python 3:
    every print call uses the single-argument parenthesised form, which
    produces the same output under both.
    """

    def __init__(self):
        self.root = TrieNode()

    def getUtf8String(self, string):
        """Return the UTF-8 encoding of ``string`` as a bytearray.

        Iterating the result yields one integer per byte, which is the key
        type used by the node dictionaries.
        """
        return bytearray(string.encode("utf-8"))

    def insert(self, bytes_array, str):
        """Store ``str`` as the value of the key ``bytes_array``.

        Missing nodes along the path are created. An existing value for
        the same key is overwritten.

        Args:
            bytes_array: iterable of byte values forming the key.
            str: value to associate with the key (the parameter name
                shadows the builtin; it is kept for caller compatibility).
        """
        node = self.root
        for byte in bytes_array:
            child = node.one_byte.get(byte)
            if child is None:
                child = TrieNode()
                node.one_byte[byte] = child
            node = child
        node.is_word = True
        node.value = str

    def find(self, bytes_array):
        """Return the value stored for ``bytes_array``, or None.

        A diagnostic line is printed when the path does not exist or when
        it exists but does not end on a stored key.
        """
        node = self.root
        for byte in bytes_array:
            node = node.one_byte.get(byte)
            if node is None:
                print("No this word in this Trie.")
                return None
        if not node.is_word:
            print("It is not a word.")
            return None
        return node.value

    def modify(self, bytes_array, str):
        """Set the value of ``bytes_array`` to ``str``, inserting if needed.

        Behaves like insert() but prints what happened: once for every
        node that had to be created, and once for whether the final node
        was newly marked as a key or an existing key was overwritten.
        """
        node = self.root
        for byte in bytes_array:
            child = node.one_byte.get(byte)
            if child is None:
                print("This word is not in this Trie, we will insert it.")
                child = TrieNode()
                node.one_byte[byte] = child
            node = child
        if not node.is_word:
            print("This word is not a word in this Trie, we will make it a word.")
            node.is_word = True
        else:
            print("modify this word...")
        node.value = str

    def delete(self, bytes_array):
        """Remove the key ``bytes_array`` from the trie if it is stored.

        When the path is missing, or ends on a node that is not a stored
        key, a diagnostic line is printed and the trie is left untouched.
        After a successful removal, nodes on the path that no longer lead
        to any stored key are pruned.
        """
        node = self.root
        path = []  # (parent, byte) for every edge walked, root first
        for byte in bytes_array:
            child = node.one_byte.get(byte)
            if child is None:
                print("This word is not in this Trie.")
                # Stop here: falling through would test (and possibly
                # erase) the longest stored prefix instead of this key.
                return
            path.append((node, byte))
            node = child
        if not node.is_word:
            print("It is not a word.")
            return

        node.is_word = False
        node.value = None

        # Walk back towards the root, dropping nodes that are neither a
        # stored key nor a prefix of one.
        for parent, byte in reversed(path):
            child = parent.one_byte[byte]
            if child.is_word or child.one_byte:
                break
            del parent.one_byte[byte]

    def print_item(self, p, indent=0):
        """Print the subtree rooted at node ``p``, one edge per line.

        Each line is ``indent`` tab characters followed by the byte value
        in quotes and an opening brace; children are printed one level
        deeper. No closing brace is printed.
        """
        if not p:
            return
        prefix = '\t' * indent
        for key, child in p.one_byte.items():
            print(prefix + "'%s' : " % key + '{')
            self.print_item(child, indent + 1)


if __name__ == "__main__":
    # Demo: load a "<word>=<pinyin>" dictionary into the trie, dump the
    # trie structure, then look up one word.
    trie = Trie256()

    with open("dictionary/pinyin.txt", 'r') as fd:
        for line in fd:
            line_split = line.split('=')
            if len(line_split) < 2:
                # Blank or malformed line without '=': skip it instead of
                # failing with an IndexError.
                continue
            word = line_split[0]
            pinyin = line_split[1].strip()
            word_bytes = trie.getUtf8String(word)
            # Debug dump of the key: each byte value prefixed by 'x'.
            print(''.join('x' + str(byte) for byte in word_bytes))
            trie.insert(word_bytes, pinyin)

    trie.print_item(trie.root)

    # A unicode literal is the same value as decoding the UTF-8 byte
    # literal, and is also valid on Python 3.
    query_bytes = trie.getUtf8String(u"一分钟")
    for byte in query_bytes:
        print(byte)
    print(trie.find(query_bytes))

秒客网

AC自动机1——适用于utf-8编码的Trie树

相关文章