# Python NLP 自然语言处理:基于隐马尔可夫模型(HMM)与维特比(Viterbi)算法的音字转换实现
import os
import pandas as pd
import numpy as np
from collections import Counter
import json
# Frequency table: whitespace-separated, no header row. Column 0 holds the
# entry text and column 1 its count.
# NOTE: the path is blank in this copy of the file and must be filled in.
# sep=r'\s+' is the documented replacement for the deprecated
# delim_whitespace=True.
yl = pd.read_table('', header=None, sep=r'\s+')
# Total of all counts in column 1; the denominator that turns a count into a
# relative frequency.
# NOTE: this name shadows the builtin sum(); it is kept because other code in
# this file reads it under this name.
sum = float(np.sum(np.array(yl.iloc[:, 1])))
# Every entry of column 0 as a plain list.
w_lis = list(yl.iloc[:, 0])
# Load the two frequency dictionaries from their JSON files
def read_dict():
    """Load two JSON files and return their contents.

    Returns:
        tuple: ``(pron, char)`` -- the decoded content of the first and of
        the second file, in that order. Presumably these are the files
        written by ``prob_cal`` (verify once the paths are filled in).

    Raises:
        OSError: if either file cannot be opened.
        json.JSONDecodeError: if either file is not valid JSON.
    """
    # NOTE: both paths are blank in this copy of the file and must be filled in.
    # The context manager closes both handles even if decoding fails.
    with open('', 'r') as f1, open('', 'r') as f2:
        pron = json.load(f1)
        char = json.load(f2)
    return pron, char
# Compute the cost of every candidate for the first syllable
# (the original comment calls these emission probabilities)
def cal_init_prob(sentence, pron):
    """Return a cost for each candidate character of the first syllable.

    The cost is ``-log(count / sum)`` where ``count`` comes from column 1 of
    the module-level table ``yl`` and ``sum`` is the module-level total.
    Candidates missing from the table get the fixed cost ``-log(1e-18)``.

    Args:
        sentence: Sequence of pronunciation keys; only ``sentence[0]`` is used.
        pron: Mapping from a pronunciation key to an iterable of candidate
            characters (iterating a dict yields its keys).

    Returns:
        dict: candidate character -> cost (lower means more frequent).

    Raises:
        IndexError: if ``sentence`` is empty.
        KeyError: if ``sentence[0]`` is not a key of ``pron``.
    """
    candidates = pron[sentence[0]]
    # Build the lookup table once per call instead of scanning the whole
    # frame for every candidate. Taking scalars out of the columns also
    # avoids calling float() on a Series, which pandas has deprecated and
    # which fails outright when an entry is listed more than once (here the
    # last listed row wins).
    freq = dict(zip(yl.iloc[:, 0], yl.iloc[:, 1]))
    unseen_cost = -np.log(1e-18)
    shoot_prob = {}
    for c in candidates:
        if c in freq:
            shoot_prob[c] = -np.log(float(freq[c]) / sum)
        else:
            shoot_prob[c] = unseen_cost
    return shoot_prob
# Compute the transition costs between neighbouring syllables
def cal_trans_prob(sentence, pron, char):
    """Return, for each pair of neighbouring syllables, a table of pair costs.

    For candidates ``c1`` of one syllable and ``c2`` of the next, the cost is
    ``-log(count(c1 + c2) / sum)`` where the count of the two-character entry
    comes from column 1 of the module-level table ``yl``. Pairs missing from
    the table get the fixed cost ``-log(1e-18)``.

    Args:
        sentence: Sequence of pronunciation keys.
        pron: Mapping from a pronunciation key to an iterable of candidate
            characters.
        char: Unused; kept so existing callers keep working.

    Returns:
        list: ``len(sentence) - 1`` dicts (empty list for shorter input);
        element ``i`` maps ``c2 -> {c1: cost}`` for the step from
        ``sentence[i]`` to ``sentence[i + 1]``.

    Raises:
        KeyError: if a needed pronunciation key is missing from ``pron``.
    """
    # One lookup table per call replaces two full scans of the table for
    # every character pair, and avoids int() on a Series, which pandas has
    # deprecated and which fails when an entry is listed more than once
    # (here the last listed row wins).
    freq = dict(zip(yl.iloc[:, 0], yl.iloc[:, 1]))
    unseen_cost = -np.log(1e-18)
    trans_prob = []
    for cur_key, next_key in zip(sentence, sentence[1:]):
        layer = {}
        for c2 in pron[next_key]:  # candidates of the following syllable
            layer[c2] = {}
            for c1 in pron[cur_key]:
                pair = c1 + c2
                if pair in freq:
                    layer[c2][c1] = -np.log(int(freq[pair]) / sum)
                else:
                    layer[c2][c1] = unseen_cost
        trans_prob.append(layer)
    return trans_prob
# Convert a sequence of pronunciations into characters
def YintoCharacters(sentence, pron, char):
    """Decode the lowest-cost character sequence for ``sentence`` (Viterbi).

    The cost of a path is the first character's cost from ``cal_init_prob``
    plus the cost from ``cal_trans_prob`` of every neighbouring pair on it.
    The previous implementation picked the cheapest pair at each step without
    regard to the character chosen for the step before, so neighbouring
    choices could contradict each other; this version keeps, for every
    candidate, the best path that ends in it and backtracks from the
    cheapest final candidate.

    Args:
        sentence: Sequence of pronunciation keys.
        pron: Mapping from a pronunciation key to its candidate characters.
        char: Passed through to ``cal_trans_prob``.

    Returns:
        str: one character per element of ``sentence``.

    Raises:
        ValueError: if some syllable has no candidates.
        Whatever ``cal_init_prob`` / ``cal_trans_prob`` raise (e.g. for an
        empty ``sentence`` or an unknown pronunciation key).
    """
    shoot_prob = cal_init_prob(sentence, pron)
    trans_prob = cal_trans_prob(sentence, pron, char)

    # best_cost[c]: lowest accumulated cost of any path ending in candidate c
    # of the syllable processed so far.
    best_cost = dict(shoot_prob)
    # back_pointers[i][c2]: the predecessor of c2 on that best path.
    back_pointers = []
    for layer in trans_prob:
        next_cost = {}
        pointers = {}
        for c2, cost_from in layer.items():
            # cost_from maps each previous candidate c1 to the pair cost.
            prev = min(cost_from, key=lambda c1: best_cost[c1] + cost_from[c1])
            next_cost[c2] = best_cost[prev] + cost_from[prev]
            pointers[c2] = prev
        back_pointers.append(pointers)
        best_cost = next_cost

    # Backtrack from the cheapest final candidate to the first character.
    path = [min(best_cost, key=lambda c: best_cost[c])]
    for pointers in reversed(back_pointers):
        path.append(pointers[path[-1]])
    return ''.join(reversed(path))
# Build the frequency dictionaries and store them as JSON
def prob_cal():
    """Count character/pronunciation co-occurrences and write them as JSON.

    Reads a gb2312-encoded table and uses its first column, where each value
    is a word followed by one whitespace-separated pronunciation token per
    character. The last character of every token is dropped (presumably a
    tone mark -- confirm against the data). Two mappings are produced:

    * ``pron``: stripped pronunciation -> {character: count}
    * ``char``: character -> {stripped pronunciation: count}

    ``pron`` is written to the first output file and ``char`` to the second.

    The original note says that two files must be created in the current
    directory before running; their names are missing from this copy, as are
    the input path and both output paths (all blank) -- fill them in.
    """
    lines = pd.read_table("", encoding='gb2312', header=None).iloc[:, 0]
    pron, char = {}, {}
    for line in lines:
        # split() without an argument ignores repeated/trailing blanks, which
        # used to create empty tokens and an IndexError further down.
        tokens = line.split()
        if not tokens:
            continue
        w, pro = tokens[0], tokens[1:]
        # Pair each character with its pronunciation; if the two counts
        # differ, the unmatched tail is ignored.
        for ch, p in zip(w, pro):
            syllable = p[:-1]
            char.setdefault(ch, Counter())[syllable] += 1
            pron.setdefault(syllable, Counter())[ch] += 1
    pron = {k: dict(v) for k, v in pron.items()}
    char = {k: dict(v) for k, v in char.items()}
    info_json1 = json.dumps(pron, sort_keys=False, indent=4, separators=(',', ': '))
    info_json2 = json.dumps(char, sort_keys=False, indent=4, separators=(',', ': '))
    # Context managers make sure both files are flushed and closed.
    with open('', 'w') as f:
        f.write(info_json1)
    with open('', 'w') as f:
        f.write(info_json2)
if __name__ == '__main__':
    # Check for the two dictionary files in the current directory.
    # NOTE(review): prob_cal() runs only when BOTH paths already exist. If the
    # intent is "generate the dictionaries when they are missing", this
    # condition should be negated -- confirm. The paths are blank in this copy
    # of the file, and os.path.exists("") is always False.
    if os.path.exists("") and os.path.exists(""):
        prob_cal()
    # Load the dictionaries
    pron, char = read_dict()
    # Read input and print the conversion until the input ends
    while True:
        try:
            line = input("Please input the pronunciations below: \n"
                         "(Note that there should not be punctuations and should be splited by blank.)\n")
        except (EOFError, KeyboardInterrupt):
            # End of input or Ctrl-C: leave the loop instead of a traceback.
            break
        # split() without an argument tolerates repeated or surrounding blanks.
        string = line.split()
        if not string:
            continue
        # Report syllables that are not in the dictionary instead of crashing
        # with a KeyError inside the decoder.
        unknown = [s for s in string if s not in pron]
        if unknown:
            print("Unknown pronunciation(s):", ' '.join(unknown))
            print()
            continue
        result = YintoCharacters(string, pron, char)
        print("[音字转换结果]", result)
        print()