Python NLP(自然语言处理):基于隐马尔可夫模型(HMM)与维特比(Viterbi)算法的音字转换实现

时间:2025-02-19 08:26:56
import os
import json
from collections import Counter

import numpy as np
import pandas as pd

# File locations. They are empty strings in this listing and must be set to
# real paths before the script can run.
FREQ_PATH = ''       # whitespace-separated table: column 0 = character or
                     # two-character sequence, column 1 = its count
PRON_DICT_PATH = ''  # JSON: pronunciation -> {character: count}
CHAR_DICT_PATH = ''  # JSON: character -> {pronunciation: count}
LEXICON_PATH = ''    # GB2312 text, one "<word> <pinyin+tone> ..." per line

# Cost (negative log-probability) charged for anything absent from the
# frequency table.
UNSEEN_COST = -np.log(1e-18)

# Cost tables already loaded from disk, keyed by path.
_COST_TABLES = {}


def build_cost_table(words, counts):
    """Turn parallel word/count sequences into a ``word -> cost`` dict.

    The cost of a word is ``-log(count / total)``, where ``total`` is the sum
    of all counts. Counts of a word that appears more than once are summed.

    Args:
        words: Iterable of table keys (characters or character pairs).
        counts: Iterable of numeric counts, parallel to ``words``.

    Returns:
        dict mapping each distinct word to its cost.
    """
    counts = np.asarray(list(counts), dtype=float)
    total = float(counts.sum())
    merged = {}
    for word, count in zip(words, counts):
        merged[word] = merged.get(word, 0.0) + count
    return {word: -np.log(count / total) for word, count in merged.items()}


def load_cost_table(path=None):
    """Load (once per path) the cost table stored at ``path``.

    Args:
        path: Frequency table location; defaults to ``FREQ_PATH``.

    Returns:
        dict as produced by :func:`build_cost_table`.
    """
    path = FREQ_PATH if path is None else path
    table = _COST_TABLES.get(path)
    if table is None:
        # sep=r'\s+' is the documented replacement for delim_whitespace=True.
        frame = pd.read_table(path, header=None, sep=r'\s+')
        table = build_cost_table(frame.iloc[:, 0], frame.iloc[:, 1])
        _COST_TABLES[path] = table
    return table


def read_dict():
    """Read the pronunciation and character dictionaries from their JSON files.

    Returns:
        Tuple ``(pron, char)`` of the two decoded JSON objects.
    """
    with open(PRON_DICT_PATH, 'r', encoding='utf-8') as f1:
        pron = json.load(f1)
    with open(CHAR_DICT_PATH, 'r', encoding='utf-8') as f2:
        char = json.load(f2)
    return pron, char


def cal_init_prob(sentence, pron, costs=None):
    """Return the cost of every candidate character for the first syllable.

    Args:
        sentence: Sequence of pronunciations; only ``sentence[0]`` is used.
        pron: Mapping pronunciation -> iterable of candidate characters.
        costs: Optional ``word -> cost`` table; loaded from ``FREQ_PATH``
            when omitted.

    Returns:
        dict mapping each candidate character to its cost
        (``UNSEEN_COST`` when the character is not in the table).
    """
    if costs is None:
        costs = load_cost_table()
    return {c: costs.get(c, UNSEEN_COST) for c in pron[sentence[0]]}


def cal_trans_prob(sentence, pron, char, costs=None):
    """Return the bigram costs between candidates of adjacent syllables.

    Args:
        sentence: Sequence of pronunciations.
        pron: Mapping pronunciation -> iterable of candidate characters.
        char: Unused; kept so existing callers keep working.
        costs: Optional ``word -> cost`` table; loaded from ``FREQ_PATH``
            when omitted.

    Returns:
        List with one entry per adjacent pair of syllables. Entry ``i`` maps
        each candidate ``c2`` of syllable ``i + 1`` to a dict mapping each
        candidate ``c1`` of syllable ``i`` to the cost of the pair
        ``c1 + c2`` (``UNSEEN_COST`` when the pair is not in the table).
    """
    if costs is None:
        costs = load_cost_table()
    trans_prob = []
    for current, following in zip(sentence, sentence[1:]):
        trans_prob.append({
            c2: {c1: costs.get(c1 + c2, UNSEEN_COST) for c1 in pron[current]}
            for c2 in pron[following]
        })
    return trans_prob


def YintoCharacters(sentence, pron, char, costs=None):
    """Convert a pronunciation sequence to characters with Viterbi decoding.

    The path cost is the cost of the first character plus the bigram cost of
    every adjacent character pair; the cheapest complete path is returned.
    Ties are broken in favour of the candidate listed first in ``pron``.

    Args:
        sentence: Sequence of pronunciations.
        pron: Mapping pronunciation -> iterable of candidate characters.
        char: Passed through to :func:`cal_trans_prob`.
        costs: Optional ``word -> cost`` table; loaded from ``FREQ_PATH``
            when omitted.

    Returns:
        The decoded string, one character per pronunciation; ``''`` for an
        empty ``sentence``.

    Raises:
        KeyError: If a pronunciation is not a key of ``pron``.
    """
    if not sentence:
        return ''
    if costs is None:
        costs = load_cost_table()

    # best[c] = cost of the cheapest path that ends in candidate c.
    best = cal_init_prob(sentence, pron, costs)
    backpointers = []
    for step in cal_trans_prob(sentence, pron, char, costs):
        next_best = {}
        pointers = {}
        for c2, from_prev in step.items():
            prev, cost = min(
                ((c1, best[c1] + pair_cost) for c1, pair_cost in from_prev.items()),
                key=lambda item: item[1],
            )
            next_best[c2] = cost
            pointers[c2] = prev
        best = next_best
        backpointers.append(pointers)

    # Walk the back-pointers from the cheapest final candidate to the start.
    path = [min(best, key=best.get)]
    for pointers in reversed(backpointers):
        path.append(pointers[path[-1]])
    return ''.join(reversed(path))


def prob_cal():
    """Build the pronunciation/character count dictionaries and save them.

    Reads ``LEXICON_PATH`` (GB2312), where every line holds a word followed
    by one pronunciation per character. The last character of each
    pronunciation is dropped (presumably a tone digit - confirm against the
    lexicon). Pronunciations beyond the word's length are ignored. The
    results are written as JSON to ``PRON_DICT_PATH`` and ``CHAR_DICT_PATH``.
    """
    entries = pd.read_table(LEXICON_PATH, encoding='gb2312', header=None).iloc[:, 0]
    pron, char = dict(), dict()
    for entry in entries:
        word, *pronunciations = entry.split()
        # Count, per pronunciation, how often each character carries it, and
        # per character, how often each pronunciation occurs.
        for ch, toned in zip(word, pronunciations):
            syllable = toned[:-1]
            char.setdefault(ch, Counter())[syllable] += 1
            pron.setdefault(syllable, Counter())[ch] += 1

    for path, table in ((PRON_DICT_PATH, pron), (CHAR_DICT_PATH, char)):
        plain = {key: dict(counter) for key, counter in table.items()}
        with open(path, 'w', encoding='utf-8') as f:
            f.write(json.dumps(plain, sort_keys=False, indent=4,
                               separators=(',', ': ')))


def _needs_generation(path):
    """Return True when ``path`` is missing or is an empty file."""
    return not os.path.exists(path) or os.path.getsize(path) == 0


def main():
    """Run the interactive pronunciation-to-character loop."""
    # Load the frequency table up front so a bad path fails before prompting.
    load_cost_table()
    # Generate the dictionaries only when they have not been produced yet.
    if _needs_generation(PRON_DICT_PATH) or _needs_generation(CHAR_DICT_PATH):
        prob_cal()
    pron, char = read_dict()
    while True:
        try:
            line = input("Please input the pronunciations below: \n"
                         "(Note that there should not be punctuations and should be splited by blank.)\n")
        except EOFError:
            break
        try:
            # split() without an argument ignores repeated/trailing blanks.
            result = YintoCharacters(line.split(), pron, char)
        except KeyError as exc:
            print("Unknown pronunciation:", exc)
            continue
        print("[音字转换结果]", result)
        print()


if __name__ == '__main__':
    main()