最近一年忙于语音识别和声纹识别的创新、写作以及相关论文发表。
并且创建了语音识别解码器微信群,想入群的读者可以留言加微信群。
微信群里不乏国内外著名高校学者、语音研究员、语音识别工程师以及针对端到端语音识别解码的独特性,本人邀请了CV、NLP领域大佬们,期待各位留言入群。
作者微信:zw76859420
好久没写博客了,今天更新一下使用Python提取声学模型的特征,一共三种特征,分别是MFCC、FABNK以及语谱图特征,直接上Python代码。
# -*- coding:utf-8 -*-
# author:zhangwei
"""
该脚本用于提取语音特征,包括MFCC、FBANK以及语谱图特征;
该脚本是对标签数据进行处理;
"""
from python_speech_features import mfcc, delta, logfbank
import wave
import numpy as np
from import fft
def read_wav_data(filename):
'''
获取文件数据以及采样频率;
输入为文件位置,输出为wav文件数学表示和采样频率;
'''
wav = (filename, 'rb')
num_frames = ()
num_channels = ()
framerate = ()
str_data = (num_frames)
()
wave_data = (str_data, dtype=)
wave_data.shape = -1, num_channels
wave_data = wave_data.T
return wave_data, framerate
def get_mfcc_feature(wavsignal, fs):
'''
输入为wav文件数学表示和采样频率,输出为语音的MFCC特征+一阶差分+二阶差分;
'''
feat_mfcc = mfcc(wavsignal, fs)
print(feat_mfcc)
feat_mfcc_d = delta(feat_mfcc, 2)
feat_mfcc_dd = delta(feat_mfcc_d, 2)
wav_feature = np.column_stack((feat_mfcc, feat_mfcc_d, feat_mfcc_dd))
return wav_feature
def get_fbank_feature(wavsignal, fs):
'''
输入为wav文件数学表示和采样频率,输出为语音的FBANK特征+一阶差分+二阶差分;
'''
feat_fbank = logfbank(wavsignal, fs, nfilt=40)
feat_fbank_d = delta(feat_fbank, 2)
feat_fbank_dd = delta(feat_fbank_d, 2)
wav_feature = np.column_stack((feat_fbank, feat_fbank_d, feat_fbank_dd))
return wav_feature
def get_frequency_feature(wavsignal, fs):
'''
输入为wav文件数学表示和采样频率,输出为语谱图特征,特征维度是200;
'''
x = (0, 400 - 1, 400, dtype=np.int64)
w = 0.54 - 0.46 * (2 * * (x) / (400 - 1))
time_window = 25
wav_array = (wavsignal)
wav_length = wav_array.shape[1]
first2end = int(len(wavsignal[0]) / fs * 1000 - time_window) // 10
data_input = (shape=[first2end, 200], dtype=)
for i in range(0, first2end):
p_start = i * 160
p_end = p_start + 400
data_line = wav_array[0, p_start:p_end]
data_line = data_line * w
data_line = (fft(data_line)) / wav_length
data_input[i] = data_line[0: 200]
data_input = (data_input)
return data_input
def get_wav_list(filepath):
'''
读取标签文件,并把标签文件与标签位置进行处理,输入为处理好的标签位置,输出为标签位置列表以及标签位置字典;
'''
list_wav = []
dic_filelist = {}
with open(filepath, 'r') as fr:
lines = ()
for line in lines:
res = ().split(' ')
dic_filelist[res[0]] = res[1]
list_wav.append(res[0])
return dic_filelist , list_wav
def get_wav_text(filename):
'''
输入为文件位置,输出为标签位置以及标签字典('D31_984': ['早稻', '播种', '和', '育秧', '的', '天气', '条件', '有利', '与否', '与', '这', '一', '期间', '的', '日', '平均', '温度', '阴雨', '日数', '密切', '相关'], 'D12_867':);
'''
dic_wav_list = {}
list_text = []
with open(filename ,'r') as fr:
lines = ()
for line in lines:
res = ().split()
list_text.append(res[0])
dic_wav_list[res[0]] = res[1 :]
return dic_wav_list , list_text
if __name__ == '__main__':
# filepath = 'D4_750.wav'
filepath = '/home/zhangwei/PycharmProjects/ASR_MFCC/datalist/'
# wavsignal, fs = read_wav_data(filepath)
# a = get_mfcc_feature(wavsignal , fs)
# b = get_fbank_feature(wavsignal , fs)
# get_frequency_feature(wavsignal , fs)
# get_wav_list(filepath)
get_wav_text(filepath)