语音识别分类

# -*- coding:utf-8 -*-

# --------------------------------------------------------- #
# #
# Train Language Recognition #
# #
# --------------------------------------------------------- #
# #
# Train Language Recognition #
# Copyright(c) iFlytek Corporation, 2018 #
# Hefei, Anhui, PRC #
# http://www.iflytek.com #
# #
# --------------------------------------------------------- #
# python : 2.7 version #
# cuda : Toolkit 9.1 #
# pytorch : 0.4.0 #
# --------------------------------------------------------- #

import os
import time
import codecs
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level = logging.DEBUG,
format = '%(asctime)s[%(levelname)s] ---- %(message)s',
)

import torch
import torch.utils.data as Data

from read_data import get_samples, get_data, TorchDataSet
from net_component import LanNet

## ======================================
# Configuration and hyper-parameters.

# List files consumed by TorchDataSet; each non-blank line is parsed there as
# "<HTK feature path> <integer label>".
train_list = "./label_train_list_fb.txt"
dev_list = "./label_dev_list_fb.txt"

# Device selection. `device` is always bound (CPU fallback) so code that refers
# to it does not hit a NameError on machines without CUDA.
# NOTE(review): the GPU index 2 is hard-coded -- confirm it exists on the host.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:2" if use_cuda else "cpu")

# Directory that receives one checkpoint per epoch.
model_dir = "/inference/models"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

# Network and training parameters.
dimension = 40        # feature dimension, passed as LanNet input_dim
language_nums = 6     # number of output classes, passed as LanNet output_dim
learning_rate = 0.1   # initial SGD learning rate
batch_size = 64       # utterances per batch
chunk_num = 10        # batches per loaded chunk (see TorchDataSet)
train_iteration = 10  # number of training epochs
display_fre = 50      # log running accuracy every `display_fre` steps
half = 4              # from this epoch index on, the learning rate is halved each epoch

## ======================================
train_dataset = TorchDataSet(train_list, batch_size, chunk_num, dimension)
dev_dataset = TorchDataSet(dev_list, batch_size, chunk_num, dimension)
logging.info('finish reading all train data')

# Build the network and log its structure.
train_module = LanNet(input_dim=dimension, hidden_dim=128, bn_dim=30, output_dim=language_nums)
logging.info(train_module)

# Move the model to its device *before* constructing the optimizer, as the
# torch.optim documentation recommends; on CPU-only hosts this is a no-op.
train_module = train_module.to(device)

# SGD with momentum performs the parameter updates.
optimizer = torch.optim.SGD(train_module.parameters(), lr=learning_rate, momentum=0.9)

def _prepare_batch(batch_x, batch_y):
    """Turn one (features, labels) pair from TorchDataSet into model inputs.

    Column 0 of ``batch_y`` is read as the class label and column 1 as the
    number of valid frames of each utterance.

    Returns:
        (data, mask, target): ``data`` is ``batch_x`` truncated to the longest
        utterance of the batch, ``mask`` is a float tensor holding 1 for valid
        frames and 0 for padding, ``target`` is the label column as a long
        tensor of shape (batch, 1). All three are moved to ``device`` when
        CUDA is in use.
    """
    batch_target = batch_y[:, 0].contiguous().view(-1, 1).long()
    batch_frames = batch_y[:, 1].contiguous().view(-1, 1)

    max_batch_frames = int(batch_frames.max().item())
    batch_data = batch_x[:, :max_batch_frames, :]

    # mask[i, t] == 1 exactly when t < frames[i]; built by broadcasting
    # instead of a per-row Python loop.
    frame_index = torch.arange(max_batch_frames, dtype=batch_frames.dtype).view(1, -1)
    batch_mask = (frame_index < batch_frames).float()

    if use_cuda:
        batch_data = batch_data.to(device)
        batch_mask = batch_mask.to(device)
        batch_target = batch_target.to(device)

    return batch_data, batch_mask, batch_target


for epoch in range(train_iteration):
    if epoch >= half:
        # Halve the learning rate in place. Re-creating the optimizer here
        # would throw away the accumulated momentum buffers.
        learning_rate /= 2.
        for param_group in optimizer.param_groups:
            param_group['lr'] = learning_rate

    ## train
    train_dataset.reset()
    train_module.train()
    epoch_tic = time.time()
    train_loss = 0.
    train_acc = 0.

    sum_batch_size = 0   # batches seen in this epoch
    curr_batch_size = 0  # batches seen since the last progress log
    curr_batch_acc = 0
    for step, (batch_x, batch_y) in enumerate(train_dataset):
        tic = time.time()
        batch_train_data, batch_mask, batch_target = _prepare_batch(batch_x, batch_y)

        acc, loss = train_module(batch_train_data, batch_mask, batch_target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        step_time = time.time() - tic

        train_loss += loss.item()
        train_acc += acc
        curr_batch_acc += acc
        sum_batch_size += 1
        curr_batch_size += 1
        if step % display_fre == 0:
            logging.info('Epoch:%d, Batch:%d, acc:%.6f, loss:%.6f, cost time :%.6fs', epoch, step, curr_batch_acc/curr_batch_size, loss.item(), step_time)
            curr_batch_acc = 0.
            curr_batch_size = 0

    # Save one checkpoint per epoch.
    modelfile = '%s/model%d.model'%(model_dir, epoch)
    torch.save(train_module.state_dict(), modelfile)
    epoch_time = time.time() - epoch_tic
    # max(..., 1) keeps the averages defined when the dataset yielded no batch.
    train_batches = max(sum_batch_size, 1)
    logging.info('Epoch:%d, train-acc:%.6f, train-loss:%.6f, cost time :%.6fs', epoch, train_acc/train_batches, train_loss/train_batches, epoch_time)

    ## -----------------------------------------------------------------------------------------------------------------------------
    ## dev
    train_module.eval()
    epoch_tic = time.time()
    dev_loss = 0.
    dev_acc = 0.
    dev_batch_num = 0

    for step, (batch_x, batch_y) in enumerate(dev_dataset):
        batch_dev_data, batch_mask, batch_target = _prepare_batch(batch_x, batch_y)
        step_batch_size = batch_target.size(0)

        with torch.no_grad():
            acc, loss = train_module(batch_dev_data, batch_mask, batch_target)

        # NOTE(review): the dev loss is divided by the batch size while the
        # training loss above is used as returned -- confirm against LanNet
        # which normalisation is intended before comparing the two numbers.
        loss = loss.sum()/step_batch_size

        dev_loss += loss.item()
        dev_acc += acc
        dev_batch_num += 1

    epoch_time = time.time() - epoch_tic
    dev_batches = max(dev_batch_num, 1)
    acc = dev_acc/dev_batches

    logging.info('Epoch:%d, dev-acc:%.6f, dev-loss:%.6f, cost time :%.6fs', epoch, acc, dev_loss/dev_batches, epoch_time)

# -*- coding:utf-8 -*-

import codecs
import copy
import random

import torch

from HTKfile import HTKfile

def get_samples(list):
    """Count the entries of a feature list file and find the longest one.

    Args:
        list: path of a UTF-8 text file whose non-blank lines start with an
            HTK feature path.

    Returns:
        (samples, max_frames): the number of non-blank lines and the largest
        frame count reported by HTKfile for any of them (0 when there are none).
    """
    sample_count = 0
    longest = 0
    with codecs.open(list, 'r', 'utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.split()
            if not fields:  # blank line: nothing to count
                continue
            frame_count = HTKfile(fields[0]).get_frame_num()
            if frame_count > longest:
                longest = frame_count
            sample_count += 1
    return sample_count, longest

def get_data(list, samples, max_frames, dimension):
    """Load every utterance named in a list file into zero-padded tensors.

    Args:
        list: path of a UTF-8 text file; each non-blank line is read as
            "<HTK feature path> <integer label>".
        samples: number of rows to allocate.
        max_frames: number of frames to allocate per row.
        dimension: feature dimension to allocate per frame.

    Returns:
        (data, target_frames, name_list): ``data`` is a
        (samples, max_frames, dimension) tensor of per-utterance mean-subtracted
        features, ``target_frames`` is a (samples, 2) tensor holding
        [label, frame count] per row, and ``name_list`` holds the file name
        reported by HTKfile for each loaded entry.
    """
    data = torch.zeros(samples, max_frames, dimension)
    target_frames = torch.zeros(samples, 2)
    name_list = []

    row = 0  # advances only for non-blank lines
    with codecs.open(list, 'r', 'utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.split()
            if not fields:  # skip blank lines
                continue
            label = int(str(fields[1]))

            reader = HTKfile(fields[0])
            features = torch.Tensor(reader.read_data())
            name = reader.get_file_name()
            n_frames = reader.get_frame_num()

            # Subtract the mean over frames from every feature dimension.
            frame_mean = features.mean(dim=0, keepdim=True)
            data[row, :n_frames, :] = features - frame_mean.expand_as(features)
            target_frames[row] = torch.Tensor([label, n_frames])
            name_list.append(name)
            row += 1

    return data, target_frames, name_list

class TorchDataSet(object):
    """Iterable that streams shuffled, zero-padded batches of HTK features.

    The list file is read once; each non-blank line is parsed as
    "<HTK feature path> <integer label>". Iteration loads the utterances in
    chunks of ``chunk_num * batch_size`` entries and yields ``(data, target)``
    pairs:

    * ``data``   -- (batch_size, max_frames, dimension) tensor of
      mean-subtracted features, zero-padded up to the longest utterance of
      the chunk.
    * ``target`` -- (batch_size, 2) tensor with [label, frame count] per row.

    Only full batches are yielded; a trailing partial batch is dropped.
    """

    def __init__(self, file_list, batch_size, chunk_num, dimension):
        """Read and shuffle the list file.

        Args:
            file_list: path of the UTF-8 list file.
            batch_size: number of utterances per yielded batch.
            chunk_num: number of batches loaded together as one chunk.
            dimension: feature dimension of every frame.
        """
        self._batch_size = batch_size
        self._chunck_num = chunk_num
        self._chunck_size = self._chunck_num * self._batch_size
        self._dimension = dimension
        with codecs.open(file_list, 'r', 'utf-8') as list_file:
            # Blank lines carry no sample and would fail to parse later, so
            # drop them here (get_samples/get_data skip them as well).
            self._dataset = [line for line in list_file.readlines() if line.strip()]
        random.shuffle(self._dataset)

    def reset(self):
        """Reshuffle the entries in place; call before each new epoch."""
        random.shuffle(self._dataset)

    def _load_sample(self, line):
        """Load one list entry.

        Returns:
            (features, target, frame_count): mean-subtracted feature tensor,
            a tensor holding [label, frame count], and the frame count.
        """
        fields = line.split()
        htk_feature = fields[0]
        target_label = int(str(fields[1]))

        htk_file = HTKfile(htk_feature)
        features = torch.Tensor(htk_file.read_data())
        frame_count = htk_file.get_frame_num()

        # Subtract the mean over frames from every feature dimension.
        frame_mean = features.mean(dim=0, keepdim=True)
        centered = features - frame_mean.expand_as(features)
        return centered, torch.Tensor([target_label, frame_count]), frame_count

    def _emit_batches(self, samples, max_frames):
        """Yield every full batch contained in ``samples``.

        A fresh pair of tensors is allocated per batch so that padding is
        always zero and previously yielded batches are never overwritten.
        """
        full_batches = len(samples) // self._batch_size
        for batch_index in range(full_batches):
            first = batch_index * self._batch_size
            data = torch.zeros(self._batch_size, max_frames, self._dimension)
            target = torch.zeros(self._batch_size, 2)
            members = samples[first:first + self._batch_size]
            for row, (features, label_frames) in enumerate(members):
                data[row, :features.size(0), :] = features
                target[row, :] = label_frames
            yield data, target

    def __iter__(self):
        """Yield ``(data, target)`` batches over the whole list, chunk by chunk."""
        pending = []    # samples of the chunk currently being collected
        max_frames = 0  # longest utterance within `pending`
        for position, line in enumerate(self._dataset, 1):
            features, label_frames, frame_count = self._load_sample(line)
            if frame_count > max_frames:
                max_frames = frame_count
            pending.append((features, label_frames))

            if position % self._chunck_size == 0:
                for batch in self._emit_batches(pending, max_frames):
                    yield batch
                pending = []
                max_frames = 0

        # Tail chunk: emit whatever full batches remain (">=" so that a tail
        # holding exactly one full batch is not lost).
        if len(pending) >= self._batch_size:
            for batch in self._emit_batches(pending, max_frames):
                yield batch

# -*- coding:utf-8 -*-

"""a module to read the HTK format file"""

__author__ = 'yfhu3'
__email__ = 'yfhu3@iflytek.com'
__version__ = '2018.01.02 with python 2.7.14'

import numpy as np
import struct
import re

# HTK文件结构：
# 帧数：4字节（第0-第3字节）
# 采样周期：4字节（第4-第7字节）
# 每一帧的字节数：2字节（第8-第9字节）
# 参数类型：2字节（第10-第11字节）
# 数据：N字节（第12字节开始-文件结尾）

class HTKfile(object):
    """Reader for one HTK-format feature file.

    File layout as parsed here (all fields big-endian):

        bytes 0-3    number of frames        (unsigned 32-bit)
        bytes 4-7    sample period           (unsigned 32-bit)
        bytes 8-9    bytes per frame         (unsigned 16-bit)
        bytes 10-11  parameter kind          (signed 16-bit)
        bytes 12-    frame data, read as 32-bit floats

    The path may end with a frame range such as ``feat.fb[2,56]`` (whitespace
    around the numbers is accepted); ``read_data`` then returns only the
    frames ``[start, end)``. An end of 0, or one past the end of the file,
    means "up to the last frame".

    The file is opened only while the header or the data is being read, so no
    descriptor stays open for the lifetime of the object.
    """

    _HEADER_BYTES = 12
    # "<real path>[<start>,<end>]" with optional whitespace around the numbers.
    _RANGE_PATTERN = re.compile(r'^(.*)\[\s*([-+]?\d+)\s*,\s*([-+]?\d+)\s*\]$')

    def __init__(self, path):
        """Parse the optional frame range and read the 12-byte header.

        Raises:
            ValueError: the path ends with ']' but carries no valid
                "[start,end]" range.
            IOError/OSError: the file cannot be opened.
            struct.error: the file is shorter than the header.
        """
        self.__start_frame = 0
        self.__end_frame = 0
        self.__new_path = path
        # Never populated by this reader; bound so that get_state_label()
        # returns None instead of raising AttributeError.
        self.__state_label = None

        if path.endswith(']'):  # a frame range is attached, e.g. [2, 56]
            matched = self._RANGE_PATTERN.match(path)
            if matched is None:
                raise ValueError('malformed frame range in path: %r' % (path,))
            self.__new_path = matched.group(1)
            self.__start_frame = int(matched.group(2))
            self.__end_frame = int(matched.group(3))

        with open(self.__new_path, 'rb') as stream:
            header = stream.read(self._HEADER_BYTES)
        # HTK stores its header big-endian.
        (self.__frame_num,
         self.__sample_period,
         self.__bytes_of_one_frame,
         self.__sample_kind) = struct.unpack('>IIHh', header)
        # Integer division: the result is used as a repeat count and as a
        # reshape dimension, so it must stay an int on Python 3 as well.
        self.__feature_dim = self.__bytes_of_one_frame // 4

        # File name = the path component just before the extension.
        pieces = re.split(r'[/.]', path)
        self.__file_name = pieces[-2] if len(pieces) >= 2 else pieces[0]

        if self.__end_frame == 0 or self.__end_frame > self.__frame_num:
            # No (or an out-of-range) end frame given: read up to the last frame.
            self.__end_frame = self.__frame_num

    def read_data(self):
        """Return the selected frames as a float32 array of shape (frames, dim).

        The file is re-opened on every call, so the method may be called
        repeatedly.

        Raises:
            struct.error: the file holds fewer bytes than the header announces.
        """
        value_count = self.__frame_num * self.__feature_dim
        with open(self.__new_path, 'rb') as stream:
            stream.seek(self._HEADER_BYTES)
            raw = stream.read(self.__frame_num * self.__bytes_of_one_frame)

        values = struct.unpack('>%df' % value_count, raw)
        data = np.array(values, dtype='float32')
        data = data.reshape(self.__frame_num, self.__feature_dim)

        return data[self.__start_frame: self.__end_frame]

    # Read-only accessors for the parsed header fields and frame range.
    def get_start_frame(self):
        """Return the index of the first selected frame."""
        return self.__start_frame

    def get_end_frame(self):
        """Return the index one past the last selected frame."""
        return self.__end_frame

    def get_frame_num(self):
        """Return the number of selected frames (end frame minus start frame)."""
        return (self.__end_frame - self.__start_frame)

    def get_sample_period(self):
        """Return the sample period field of the header."""
        return self.__sample_period

    def get_bytes_of_one_frame(self):
        """Return the size of one frame in bytes, as stated in the header."""
        return self.__bytes_of_one_frame

    def get_file_name(self):
        """Return the file name without directories and extension."""
        return self.__file_name

    def get_feature_dim(self):
        """Return the number of 32-bit values per frame."""
        return self.__feature_dim

    def get_state_label(self):
        """Return the state label; always None, as this reader never sets one."""
        return self.__state_label

秒客网

语音识别分类

相关文章