I recently met a friend who works on Python speech recognition. While we were chatting, he said that over the next five to ten years, Python-based AI will set off a craze in China, and its impact on all kinds of applications will be no smaller than the impact Taobao had on brick-and-mortar retail. Doing this kind of work locally (a third-tier city in Jiangsu) may not pay off in the short term, but in the long run it is clearly a smart choice. The friend is originally from Shandong and moved here after graduation to start a business; he really has vision.
Here is a simple write-up of what I learned in my AI course: a system that can recognize single spoken digits from 0 to 9. The basic approach is to extract MFCC features with a library function, compute a distortion matrix between the input and each template, and then use dynamic programming to build a cumulative-distance matrix, with the matching path restricted to a band around the diagonal. The underlying techniques (MFCC and DTW) are well documented online, so I will not go into detail here.
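For readers who want the core recurrence in isolation, here is a minimal sketch assuming two MFCC sequences of equal length; the function name dtw_distance, the band half-width band=5 and the plain Euclidean frame distance are placeholder choices, not exactly the settings used in the full program below.

import numpy as np

def dtw_distance(feat_a, feat_b, band=5):
    # feat_a, feat_b: (frames, coefficients) feature arrays of equal length.
    n, m = len(feat_a), len(feat_b)
    # Distortion matrix: Euclidean distance between every pair of frames.
    dist = np.linalg.norm(feat_a[:, None, :] - feat_b[None, :, :], axis=2)
    # Cumulative-distance matrix filled in by dynamic programming.
    acc = np.full((n, m), np.inf)
    acc[0, 0] = dist[0, 0]
    for i in range(n):
        for j in range(m):
            if (i == 0 and j == 0) or abs(i - j) > band:
                continue   # skip the start cell and cells outside the band
            prev = [acc[i - 1, j] if i > 0 else np.inf,
                    acc[i, j - 1] if j > 0 else np.inf,
                    acc[i - 1, j - 1] if i > 0 and j > 0 else np.inf]
            acc[i, j] = dist[i, j] + min(prev)
    return acc[n - 1, m - 1]

The full program below additionally drops MFCC coefficient 0 when computing frame distances and picks the best endpoint inside the band on the last row or column instead of always using the corner cell.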
The current drawback is that every input recording has to be exactly 1 s long; with unconstrained lengths the recognition accuracy drops. The planned improvement is to extract only the valid speech segment, but that part is not finished yet: only a prototype function has been written and it still needs work.
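As a rough sketch of that direction (separate from the find_point / select_valid prototype in the code below), one common option is to trim leading and trailing silence with a short-frame energy threshold; the function name trim_silence, the 10 ms frame length and the 0.1 threshold here are just placeholder choices, not tuned values.

import numpy as np

def trim_silence(signal, frame_len=480, threshold=0.1):
    # Keep the span of `signal` whose short-frame RMS energy exceeds `threshold`.
    # frame_len=480 is 10 ms at 48 kHz; both numbers are illustrative, not tuned.
    x = signal.astype(np.float64)
    x /= (np.abs(x).max() + 1e-9)                        # normalize to roughly [-1, 1]
    n_frames = len(x) // frame_len
    energy = np.array([np.sqrt(np.mean(x[k * frame_len:(k + 1) * frame_len] ** 2))
                       for k in range(n_frames)])
    active = np.where(energy > threshold)[0]
    if len(active) == 0:
        return signal                                    # nothing above threshold
    start = active[0] * frame_len
    end = (active[-1] + 1) * frame_len
    return signal[start:end]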
import wave
import numpy as np
import matplotlib.pyplot as plt
from python_speech_features import mfcc
from math import sqrt

def read_file(file_name):
    # Read a mono 16-bit wav file and return the samples plus the time axis.
    with wave.open(file_name, 'r') as file:
        params = file.getparams()
        _, _, framerate, nframes = params[:4]
        str_data = file.readframes(nframes)
        wave_data = np.frombuffer(str_data, dtype=np.int16)
        time = np.arange(0, nframes) * (1.0 / framerate)
        return wave_data, time

def find_point(data):
    # Prototype endpoint detection: the speech start is taken as the point where 40
    # consecutive samples exceed 0.15, the end where 5 consecutive samples fall below 0.001.
    count1, count2 = 0, 0
    index1, index2 = 0, len(data)
    for index, val in enumerate(data):
        if count1 < 40:
            count1 = count1 + 1 if abs(val) > 0.15 else 0
            index1 = index
        if count1 == 40 and count2 < 5:
            count2 = count2 + 1 if abs(val) < 0.001 else 0
            index2 = index
        if count2 == 5:
            break
    return index1, index2

def select_valid(data):
    # Keep only the segment between the detected start and end points.
    start, end = find_point(normalized(data))
    print(start, end)
    return data[start:end]

def normalized(a):
    # Scale the waveform by its maximum value.
    return a / max(a)

def compute_mfcc_coff(file_prefix=''):
    # Compute the MFCC features of every wav file in a prefix group,
    # truncating each feature sequence to 75 frames so all matrices are square.
    mfcc_feats = []
    s = range(10)
    I = [0, 3, 4, 8]
    II = [5, 7, 9]
    Input = {'': s, 'I': I, 'II': II, 'B': s}
    for file_name in (file_prefix + '{0}.wav'.format(i) for i in Input[file_prefix]):
        data, time = read_file(file_name)
        # data = select_valid(data)
        # if file_prefix == 'II': data = select_valid(data)
        mfcc_feat = mfcc(data, 48000)[:75]   # recordings are sampled at 48 kHz
        mfcc_feats.append(mfcc_feat)
    return np.array(mfcc_feats)

def create_dist():
    for i, m_i in enumerate(mfcc_coff_input):    # MFCCs of the inputs
        for j, m_j in enumerate(mfcc_coff):      # MFCCs of the template set
            # Build the distortion matrix between input i and template j:
            # Euclidean distance between every pair of frames (coefficient 0 dropped).
            distortion_mat = np.zeros((len(m_i), len(m_j)), dtype=np.double)
            for k1, mfcc1 in enumerate(m_i):
                for k2, mfcc2 in enumerate(m_j):
                    distortion_mat[k1][k2] = sqrt(sum((mfcc1[1:] - mfcc2[1:]) ** 2))
            yield i, j, distortion_mat

def create_Dist():
    # Turn each distortion matrix into a cumulative-distance matrix by dynamic programming.
    for _i, _j, dist in create_dist():
        N = len(dist)    # every feature sequence is truncated to 75 frames, so dist is square
        Dist = np.zeros((N, N), dtype=np.double)
        Dist[0][0] = dist[0][0]
        for i in range(N):
            for j in range(N):
                if i | j == 0:
                    continue
                pos = [(i - 1, j), (i, j - 1), (i - 1, j - 1)]
                Dist[i][j] = dist[i][j] + min(Dist[k1][k2] for k1, k2 in pos if k1 > -1 and k2 > -1)
        yield _i, _j, Dist

def search_path(n):
    # For every (input, template) pair, pick the best endpoint on the last row/column
    # within a band of half-width cut_off, then trace the optimal warping path back.
    comparison = np.zeros((n, 10), dtype=np.double)
    for _i, _j, Dist in create_Dist():
        N = len(Dist)
        cut_off = 5
        row = [(d, N - 1, j) for j, d in enumerate(Dist[N - 1]) if abs(N - 1 - j) <= cut_off]
        col = [(d, i, N - 1) for i, d in enumerate(Dist[:, N - 1]) if abs(N - 1 - i) <= cut_off]
        min_d, min_i, min_j = min(row + col)
        comparison[_i][_j] = min_d
        optimal_path_x, optimal_path_y = [min_i], [min_j]
        while min_i and min_j:
            optimal_path_x.append(min_i)
            optimal_path_y.append(min_j)
            pos = [(min_i - 1, min_j), (min_i, min_j - 1), (min_i - 1, min_j - 1)]
            min_d, min_i, min_j = min((Dist[int(k1)][int(k2)], k1, k2) for k1, k2 in pos
                                      if abs(k1 - k2) <= cut_off)
        if _i == _j and _i == 4:
            # Plot the warping path of one pair as a sanity check.
            plt.scatter(optimal_path_x[::-1], optimal_path_y[::-1], color='red')
            plt.show()
    return comparison

mfcc_coff_input = []
mfcc_coff = []

def match(pre):
    global mfcc_coff_input
    global mfcc_coff
    mfcc_coff_input = compute_mfcc_coff(pre)
    compare = np.zeros((len(mfcc_coff_input), 10), dtype=np.double)
    for prefix in ['', 'B']:     # compare against both recorded template sets
        mfcc_coff = compute_mfcc_coff(prefix)
        compare += search_path(len(mfcc_coff_input))
    for l in compare:
        print([int(x) for x in l])
        # The column with the smallest accumulated distance is the recognized digit.
        print(min((val, index) for index, val in enumerate(l))[1])

data, time = read_file('8.wav')
match('I')
match('II')
Summary
That is all for this article on implementing a simple speech recognition system in Python. I hope it is helpful. If anything is missing or wrong, feel free to point it out in the comments. Thanks for your support of this site!
Original article: http://blog.csdn.net/pp634077956/article/details/52916699