The first step in a speech recognition system is feature extraction. MFCCs (Mel-frequency cepstral coefficients) describe the envelope of the short-time power spectrum and are widely used in speech recognition systems.
1. The Mel filter bank
A speech signal is split into many frames, and each frame is mapped to a spectrum via an FFT; the spectrum describes how signal energy is distributed over frequency. A Mel filter bank is a set of band-pass filters. On the Mel scale the passbands are equally wide, but on the Hertz scale the filters are densely packed with narrow passbands at low frequencies and sparse with wide passbands at high frequencies. This mimics the nonlinear way the human ear perceives sound: it discriminates frequencies more finely at low frequencies and more coarsely at high frequencies.
The relationship between frequency in Hertz and Mel frequency is:

$$\mathrm{mel}(f) = 1125 \ln\left(1 + \frac{f}{700}\right), \qquad f = 700\left(e^{\mathrm{mel}/1125} - 1\right)$$
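As a quick illustration, here is a minimal NumPy sketch of the two conversions; the function names hz_to_mel and mel_to_hz are my own, and the constants match the mel_filter() implementation further below.

import numpy as np

def hz_to_mel(f):
    # Hz -> Mel, natural-log form: 1125 * ln(1 + f / 700)
    return 1125.0 * np.log(1.0 + f / 700.0)

def mel_to_hz(mel):
    # Inverse mapping: Mel -> Hz
    return 700.0 * (np.exp(mel / 1125.0) - 1.0)

print(hz_to_mel(1000.0))              # about 998.2 mel
print(mel_to_hz(hz_to_mel(4000.0)))   # round trip gives back 4000.0 Hz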
Assume there are $M$ band-pass filters $H_m(k)$, $0 \le m < M$, on the Mel scale, each with center frequency $f(m)$. The transfer function of each band-pass filter is:

$$
H_m(k) =
\begin{cases}
0, & k < f(m-1) \\
\dfrac{k - f(m-1)}{f(m) - f(m-1)}, & f(m-1) \le k \le f(m) \\
\dfrac{f(m+1) - k}{f(m+1) - f(m)}, & f(m) \le k \le f(m+1) \\
0, & k > f(m+1)
\end{cases}
$$
The original article shows a figure of the Mel filter bank on the Hertz frequency axis with 24 band-pass filters (figure not reproduced here; the mel_filter() function in the code below plots the same filter bank).
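As a toy check of the transfer function above, the sketch below evaluates a single triangular filter over FFT bin indices. The helper triangular_filter and the example edge positions (4, 8, 14) are illustrative only and are not taken from the article; the actual filter bank is built by mel_filter() in the code further below.

import numpy as np

def triangular_filter(k, f_lo, f_c, f_hi):
    # Evaluate H_m(k) for one triangular filter with edges f_lo, f_hi and center f_c.
    k = np.asarray(k, dtype=float)
    h = np.zeros_like(k)
    rising = (k >= f_lo) & (k <= f_c)
    falling = (k >= f_c) & (k <= f_hi)
    h[rising] = (k[rising] - f_lo) / (f_c - f_lo)
    h[falling] = (f_hi - k[falling]) / (f_hi - f_c)
    return h

bins = np.arange(20)
print(triangular_filter(bins, 4, 8, 14))  # ramps from 0 at bin 4 up to 1 at bin 8, back to 0 at bin 14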
2. MFCC features
Steps for extracting the MFCC coefficients (a short library-based cross-check follows the list):
(1) Split the speech signal into frames.
(2) Apply an FFT to each frame to obtain its power spectrum.
(3) Pass the short-time power spectrum through the Mel filter bank.
(4) Take the logarithm of the filter-bank energies.
(5) Apply a discrete cosine transform (DCT) to the log filter-bank energies.
(6) Keep cepstral coefficients 2 through 13 as the features of the short-time speech signal.
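For comparison, libraries such as librosa implement this pipeline end to end. Below is a minimal usage sketch, assuming librosa is installed and an audio_data.wav file is available; the parameters follow librosa's API rather than the hand-rolled code in the next section.

import librosa

# Load the waveform at its native sampling rate.
y, sr = librosa.load('audio_data.wav', sr=None)

# 13 MFCCs per frame; librosa handles framing, FFT, Mel filtering, log and DCT internally.
mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=13)
print(mfccs.shape)  # (13, number_of_frames)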
Python implementation
import wave
import numpy as np
import math
import matplotlib.pyplot as plt
from scipy.fftpack import dct

def read(data_path):
    '''Read a speech signal from a wav file.'''
    f = wave.open(data_path, 'rb')
    params = f.getparams()
    nchannels, sampwidth, framerate, nframes = params[:4]  # channels, sample width, sample rate, number of samples
    str_data = f.readframes(nframes)  # raw audio as a byte string
    f.close()
    wavedata = np.frombuffer(str_data, dtype=np.int16)  # bytes -> 16-bit integer samples
    wavedata = wavedata * 1.0 / max(abs(wavedata))      # normalize the amplitude to [-1, 1]
    return wavedata, nframes, framerate
def enframe(data, win, inc):
    '''Split a speech signal into overlapping frames.
    input:  data (1-D array): speech signal
            win (int or 1-D array): frame length, or a window vector
            inc (int): hop size (how far the window advances each step)
    output: f (2-D array): one frame per row
    '''
    nx = len(data)  # length of the speech signal
    try:
        nwin = len(win)
    except Exception:
        nwin = 1
    if nwin == 1:
        wlen = win
    else:
        wlen = nwin
    nf = int(np.fix((nx - wlen) / inc) + 1)  # number of frames
    indf = (np.mat([inc * j for j in range(nf)])).T  # start index of each frame
    inds = np.mat(range(wlen))                       # sample offsets within a frame
    indf_tile = np.tile(indf, wlen)
    inds_tile = np.tile(inds, (nf, 1))
    mix_tile = indf_tile + inds_tile                 # absolute index of every sample in every frame
    f = np.zeros((nf, wlen))
    for i in range(nf):
        for j in range(wlen):
            f[i, j] = data[mix_tile[i, j]]
    return f
def point_check(wavedata, win, inc):
    '''Endpoint (voice activity) detection on the speech signal.
    input:  wavedata (1-D array): raw speech signal
    output: the frames between the detected start point and end point
    '''
    # 1. Short-time zero-crossing rate
    frametemp1 = enframe(wavedata[0:-1], win, inc)
    frametemp2 = enframe(wavedata[1:], win, inc)
    signs = np.sign(np.multiply(frametemp1, frametemp2))  # adjacent samples with opposite signs cross zero
    signs = list(map(lambda x: [[i, 0][i > 0] for i in x], signs))
    signs = list(map(lambda x: [[i, 1][i < 0] for i in x], signs))
    diffs = np.sign(abs(frametemp1 - frametemp2) - 0.01)
    diffs = list(map(lambda x: [[i, 0][i < 0] for i in x], diffs))
    zcr = list((np.multiply(signs, diffs)).sum(axis=1))
    # 2. Short-time energy
    amp = list((abs(enframe(wavedata, win, inc))).sum(axis=1))
    # Thresholds
    zcrlow = max([round(np.mean(zcr) * 0.1), 3])   # low zero-crossing-rate threshold
    zcrhigh = max([round(max(zcr) * 0.1), 5])      # high zero-crossing-rate threshold
    amplow = min([min(amp) * 10, np.mean(amp) * 0.2, max(amp) * 0.1])   # low energy threshold
    amphigh = max([min(amp) * 10, np.mean(amp) * 0.2, max(amp) * 0.1])  # high energy threshold
    # Endpoint detection
    maxsilence = 8   # longest allowed silence gap (in frames)
    minaudio = 16    # shortest allowed speech segment (in frames)
    status = 0       # 0: silence, 1: transition, 2: speech, 3: finished
    holdtime = 0     # speech duration (in frames)
    silencetime = 0  # silence duration (in frames)
    print('Endpoint detection started')
    startpoint = 0
    for n in range(len(zcr)):
        if status == 0 or status == 1:
            if amp[n] > amphigh or zcr[n] > zcrhigh:
                startpoint = n - holdtime
                status = 2
                holdtime = holdtime + 1
                silencetime = 0
            elif amp[n] > amplow or zcr[n] > zcrlow:
                status = 1
                holdtime = holdtime + 1
            else:
                status = 0
                holdtime = 0
        elif status == 2:
            if amp[n] > amplow or zcr[n] > zcrlow:
                holdtime = holdtime + 1
            else:
                silencetime = silencetime + 1
                if silencetime < maxsilence:
                    holdtime = holdtime + 1
                elif (holdtime - silencetime) < minaudio:
                    status = 0
                    holdtime = 0
                    silencetime = 0
                else:
                    status = 3
        elif status == 3:
            break
        if status == 3:
            break
    holdtime = holdtime - silencetime
    endpoint = startpoint + holdtime
    return frametemp1[startpoint:endpoint]
def mfcc(framek, framerate, win):
    '''Extract MFCC features.
    input:  framek (2-D array): framed speech signal
            framerate: sampling rate of the speech signal
            win: frame length (FFT size)
    output: power spectrum, Mel filter bank, filter-bank energies, log energies, MFCCs
    '''
    # Mel filter bank
    mel_bank, w2 = mel_filter(24, win, framerate, 0, 0.5)
    framek = framek.T
    # Power spectrum
    s = abs(np.fft.fft(framek, axis=0)) ** 2
    # Pass the power spectrum through the filter bank
    p = np.dot(mel_bank, s[0:w2, :])
    # Take the logarithm
    logp = np.log(p)
    # DCT coefficients (a manual DCT, kept for reference):
    # rdct = 12
    # cdct = 24
    # dctcoef = []
    # for i in range(1, rdct + 1):
    #     tmp = [np.cos((2 * j + 1) * i * math.pi * 1.0 / (2.0 * cdct)) for j in range(cdct)]
    #     dctcoef.append(tmp)
    # d = np.dot(dctcoef, logp)
    num_ceps = 12
    d = dct(logp, type=2, axis=0, norm='ortho')[1:(num_ceps + 1), :]
    return s, mel_bank, p, logp, d
def mel_filter(m, n, fs, l, h):
    '''Build a Mel filter bank.
    input:  m (int): number of filters
            n (int): FFT size
            fs (int): sampling rate
            l (float): low-frequency limit as a fraction of fs
            h (float): high-frequency limit as a fraction of fs
    output: melbank (2-D array): Mel filter bank
    '''
    fl = fs * l  # lowest frequency covered by the filter bank
    fh = fs * h  # highest frequency covered by the filter bank
    bl = 1125 * np.log(1 + fl / 700)  # convert Hz to Mel
    bh = 1125 * np.log(1 + fh / 700)
    b = bh - bl  # bandwidth on the Mel scale
    y = np.linspace(0, b, m + 2)  # equally spaced points on the Mel scale
    print('Mel spacing:', y)
    fb = 700 * (np.exp(y / 1125) - 1)  # convert Mel back to Hz
    print(fb)
    w2 = int(n / 2 + 1)
    df = fs / n
    freq = []  # frequency (Hz) of each FFT bin
    for j in range(0, w2):
        freq.append(int(j * df))
    melbank = np.zeros((m, w2))
    print(freq)
    for k in range(1, m + 1):
        f1 = fb[k - 1]  # left edge
        f2 = fb[k + 1]  # right edge
        f0 = fb[k]      # center frequency
        n1 = np.floor(f1 / df)
        n2 = np.floor(f2 / df)
        n0 = np.floor(f0 / df)
        for i in range(1, w2):
            if i >= n1 and i <= n0:
                melbank[k - 1, i] = (i - n1) / (n0 - n1)
            if i >= n0 and i <= n2:
                melbank[k - 1, i] = (n2 - i) / (n2 - n0)
        plt.plot(freq, melbank[k - 1, :])
    plt.show()
    return melbank, w2
if __name__ == '__main__':
    data_path = 'audio_data.wav'
    win = 256   # frame length / FFT size
    inc = 80    # hop size
    wavedata, nframes, framerate = read(data_path)
    framek = point_check(wavedata, win, inc)
    s, mel_bank, p, logp, d = mfcc(framek, framerate, win)
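One simple way to inspect the result is to plot the returned matrix d (12 cepstral coefficients per frame) as a heat map. The sketch below is a minimal matplotlib example using the variable names from the listing above.

import matplotlib.pyplot as plt

# Rows are cepstral coefficients, columns are frames.
plt.imshow(d, aspect='auto', origin='lower', cmap='viridis')
plt.xlabel('frame index')
plt.ylabel('cepstral coefficient')
plt.colorbar()
plt.show()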
That's all for this article. I hope it helps with your learning, and please continue to support 服务器之家.
Original article: https://blog.csdn.net/Luqiang_Shi/article/details/91049684