语音识别分类

时间:2022-02-21 20:31:55
# -*- coding:utf-8 -*-


# --------------------------------------------------------- #
#                                                           #
#                Train Language Recognition                 #
#                                                           #
# --------------------------------------------------------- #
#                                                           #
#                  Train Language Recognition               #
#          Copyright(c) iFlytek Corporation, 2018           #
#                    Hefei, Anhui, PRC                      #
#                   http://www.iflytek.com                  #
#                                                           #
# --------------------------------------------------------- #
#  python  : 2.7 version                                    #
#  cuda    : Toolkit 9.1                                    #
#  pytorch : 0.4.0                                          #
# --------------------------------------------------------- #


import os
import time
import codecs
import logging
logger = logging.getLogger(__name__)
logging.basicConfig(level = logging.DEBUG,
                    format = '%(asctime)s[%(levelname)s] ---- %(message)s',
                    )


import torch
import torch.utils.data as Data


from read_data import get_samples, get_data, TorchDataSet
from net_component import LanNet


## ======================================
# Configuration: file lists and hyper-parameters.
# List files naming the training and development (validation) utterances.
# Each non-blank line is split on whitespace into a feature path and an
# integer label by the dataset reader.
train_list = "./label_train_list_fb.txt"
dev_list   = "./label_dev_list_fb.txt"


# Basic runtime configuration.
# NOTE(review): `device` is only bound when CUDA is available; every later use
# in this script is guarded by `use_cuda`.  The GPU index 2 is hard-coded --
# presumably a machine-specific choice; confirm before running elsewhere.
use_cuda = torch.cuda.is_available()
if use_cuda: 
    device = torch.device("cuda:2")


# Directory where one model checkpoint per epoch is saved (created if missing).
model_dir = "/inference/models"
if not os.path.exists(model_dir):
    os.makedirs(model_dir)
# Network and training hyper-parameters.
dimension = 40          # feature dimension; passed to the dataset and to LanNet as input_dim
language_nums = 6       # number of output classes; passed to LanNet as output_dim
learning_rate = 0.1     # initial SGD learning rate
batch_size = 64         # utterances per batch
chunk_num = 10          # batches per chunk loaded by TorchDataSet
train_iteration = 10    # number of epochs
display_fre = 50        # log training progress every `display_fre` steps
half = 4                # from this epoch index onward the learning rate is halved each epoch




## ======================================
# Build the training and development datasets.  TorchDataSet only reads the
# list file lines here; the feature files themselves are loaded lazily while
# iterating.
train_dataset = TorchDataSet(train_list, batch_size, chunk_num, dimension)
dev_dataset = TorchDataSet(dev_list, batch_size, chunk_num, dimension)
logging.info('finish reading all train data')


# Build the network and the optimizer (SGD with momentum).
train_module = LanNet(input_dim=dimension, hidden_dim=128, bn_dim=30, output_dim=language_nums)
logging.info(train_module)
optimizer = torch.optim.SGD(train_module.parameters(), lr=learning_rate, momentum=0.9)


# Move the model to the GPU when one is available.
if use_cuda:
    train_module = train_module.to(device)


# Main loop: one training pass and one development (validation) pass per epoch.
for epoch in range(train_iteration):
    # From epoch index `half` onward, halve the learning rate every epoch.
    # NOTE(review): a brand-new SGD optimizer is constructed here, which
    # presumably discards any momentum state accumulated so far -- confirm this
    # is intended rather than updating the existing optimizer's learning rate.
    if epoch >= half:
        learning_rate /= 2.
        optimizer = torch.optim.SGD(train_module.parameters(), lr=learning_rate, momentum=0.9)


##  train
    # Reshuffle the utterance list and switch the model to training mode.
    train_dataset.reset()
    train_module.train()
    epoch_tic = time.time()
    train_loss = 0.
    train_acc = 0.


    # sum_batch_size counts batches over the whole epoch; curr_batch_* are
    # running values that are reset after every progress log line.
    sum_batch_size = 0
    curr_batch_size = 0
    curr_batch_acc = 0
    for step, (batch_x, batch_y) in enumerate(train_dataset): 
        tic = time.time()
        # batch_y column 0 holds the class label, column 1 the frame count.
        batch_target = batch_y[:,0].contiguous().view(-1, 1).long()
        batch_frames = batch_y[:,1].contiguous().view(-1, 1)


        # Trim the padded batch to the longest utterance actually in it.
        max_batch_frames = int(max(batch_frames).item())
        batch_train_data = batch_x[:, :max_batch_frames, :]


        # Build a mask with 1 on each utterance's real frames and 0 on padding.
        step_batch_size = batch_target.size(0)
        batch_mask = torch.zeros(step_batch_size, max_batch_frames)
        for ii in range(step_batch_size):
            frames = int(batch_frames[ii].item())
            batch_mask[ii, :frames] = 1.


        # Move the batch to the GPU.
        if use_cuda:
            batch_train_data = batch_train_data.to(device)
            batch_mask       = batch_mask.to(device)
            batch_target     = batch_target.to(device)


        # The model's forward pass returns an (accuracy, loss) pair.
        acc, loss = train_module(batch_train_data, batch_mask, batch_target)
        
        # loss = loss.sum()
        # Standard update: clear gradients, back-propagate, take one step.
        backward_loss = loss
        optimizer.zero_grad()
        backward_loss.backward()
        optimizer.step()


        toc = time.time()
        step_time = toc-tic


        train_loss += loss.item()
        train_acc += acc
        curr_batch_acc += acc
        sum_batch_size += 1
        curr_batch_size += 1
        # Periodic progress log: mean accuracy since the previous log line and
        # the loss of the current batch only.
        if step % display_fre == 0:
            logging.info('Epoch:%d, Batch:%d, acc:%.6f, loss:%.6f, cost time :%.6fs', epoch, step, curr_batch_acc/curr_batch_size, loss.item(), step_time)
            curr_batch_acc = 0.
            curr_batch_size = 0




    
    # Save a checkpoint of the model weights for this epoch, then log the
    # epoch-level training averages.
    # NOTE(review): the averages below divide by sum_batch_size, which is 0
    # (ZeroDivisionError) if the training dataset yielded no batch.
    modelfile = '%s/model%d.model'%(model_dir, epoch)
    torch.save(train_module.state_dict(), modelfile)
    epoch_toc = time.time()
    epoch_time = epoch_toc-epoch_tic
    logging.info('Epoch:%d, train-acc:%.6f, train-loss:%.6f, cost time :%.6fs', epoch, train_acc/sum_batch_size, train_loss/sum_batch_size, epoch_time)


##  -----------------------------------------------------------------------------------------------------------------------------
##  dev
    # Evaluation pass over the development set (model in eval mode).
    # dev_dataset.reset() is not called, so its order is whatever the
    # constructor's shuffle produced.
    train_module.eval()
    epoch_tic = time.time()
    dev_loss = 0.
    dev_acc = 0.
    dev_batch_num = 0 


    for step, (batch_x, batch_y) in enumerate(dev_dataset): 
        tic = time.time()


        # Same label / frame-count split, trimming and masking as in training.
        batch_target = batch_y[:,0].contiguous().view(-1, 1).long()
        batch_frames = batch_y[:,1].contiguous().view(-1, 1)


        max_batch_frames = int(max(batch_frames).item())
        batch_dev_data = batch_x[:, :max_batch_frames, :]


        step_batch_size = batch_target.size(0)
        batch_mask = torch.zeros(step_batch_size, max_batch_frames)
        for ii in range(step_batch_size):
            frames = int(batch_frames[ii].item())
            batch_mask[ii, :frames] = 1.


        # Move the batch to the GPU.
        if use_cuda:
            batch_dev_data   = batch_dev_data.to(device)
            batch_mask       = batch_mask.to(device)
            batch_target     = batch_target.to(device)
            
        # No gradients are needed for evaluation.
        with torch.no_grad():
            acc, loss = train_module(batch_dev_data, batch_mask, batch_target)
        
        # NOTE(review): the dev loss is summed and divided by the batch size,
        # while the training loss above is used as returned -- the two logged
        # losses may therefore be on different scales; confirm against LanNet.
        loss = loss.sum()/step_batch_size


        toc = time.time()
        step_time = toc-tic


        dev_loss += loss.item()
        dev_acc += acc
        dev_batch_num += 1
    
    # Epoch-level development averages.
    # NOTE(review): divides by dev_batch_num, which is 0 (ZeroDivisionError)
    # if the development dataset yielded no batch.
    epoch_toc = time.time()
    epoch_time = epoch_toc-epoch_tic
    acc=dev_acc/dev_batch_num

    logging.info('Epoch:%d, dev-acc:%.6f, dev-loss:%.6f, cost time :%.6fs', epoch, acc, dev_loss/dev_batch_num, epoch_time)





# -*- coding:utf-8 -*-


import codecs
import copy
import random


import torch


from HTKfile import HTKfile


def get_samples(list):
    """Count the utterances in a list file and find the longest one.

    Each non-blank line is split on whitespace; the first field is handed to
    ``HTKfile`` and the resulting frame count is compared with the running
    maximum.  Blank lines are ignored.

    Args:
        list: path of the UTF-8 list file.  (The name shadows the builtin but
            is kept because it is part of the existing signature.)

    Returns:
        A ``(samples, max_frames)`` tuple: number of non-blank lines and the
        largest frame count seen (``0`` when the file has no entries).
    """
    utterance_count = 0
    longest = 0
    with codecs.open(list, 'r', 'utf-8') as list_file:
        for raw_line in list_file:
            fields = raw_line.split()
            if not fields:
                # Blank (or whitespace-only) line.
                continue

            frame_count = HTKfile(fields[0]).get_frame_num()
            if frame_count > longest:
                longest = frame_count
            utterance_count += 1
    return utterance_count, longest




def get_data(list, samples, max_frames, dimension):
    """Load every utterance of a list file into pre-sized, zero-padded tensors.

    Each non-blank line is split on whitespace into a feature path (field 0)
    and an integer label (field 1).  The features are read through
    ``HTKfile``, the per-dimension mean over frames is subtracted, and the
    result is copied into the next free row of the output tensor.

    Args:
        list: path of the UTF-8 list file (name kept for compatibility even
            though it shadows the builtin).
        samples: number of rows to allocate.
        max_frames: number of frames to allocate per row.
        dimension: feature dimension to allocate per frame.

    Returns:
        A ``(data, target_frames, name_list)`` tuple where ``data`` has shape
        ``(samples, max_frames, dimension)``, ``target_frames`` has shape
        ``(samples, 2)`` holding ``[label, frame_count]`` per row, and
        ``name_list`` holds the file names reported by ``HTKfile``.
    """
    data = torch.zeros(samples, max_frames, dimension)
    target_frames = torch.zeros(samples, 2)
    name_list = []

    with codecs.open(list, 'r', 'utf-8') as list_file:
        for raw_line in list_file:
            fields = raw_line.split()
            if not fields:
                # Blank (or whitespace-only) line.
                continue

            target_label = int(str(fields[1]))
            htk_file = HTKfile(fields[0])
            feature = torch.Tensor(htk_file.read_data())
            file_name = htk_file.get_file_name()
            frame_count = htk_file.get_frame_num()

            # One name is appended per stored utterance, so the list length is
            # also the index of the next free row.
            row = len(name_list)

            # Mean normalisation over the time axis.
            frame_mean = feature.mean(dim=0, keepdim=True)
            data[row, :frame_count, :] = feature - frame_mean.expand_as(feature)
            target_frames[row] = torch.Tensor([target_label, frame_count])
            name_list.append(file_name)

    return data, target_frames, name_list


class TorchDataSet(object):
    """Iterable dataset that yields fixed-size, zero-padded batches.

    The list file contains one utterance per line: a feature path followed by
    an integer label, separated by whitespace.  Utterances are loaded in
    chunks of ``chunk_num * batch_size``; every batch cut from a chunk is
    padded to the longest utterance of that chunk.

    Iterating yields ``(data, target)`` pairs where ``data`` has shape
    ``(batch_size, chunk_max_frames, dimension)`` and ``target`` has shape
    ``(batch_size, 2)`` holding ``[label, frame_count]`` per row.  Utterances
    that do not fill a complete batch at the end of the list are dropped.

    Args:
        file_list: path of the UTF-8 list file.
        batch_size: number of utterances per yielded batch (must be positive).
        chunk_num: number of batches per loaded chunk (must be positive).
        dimension: feature dimension of every frame.

    Raises:
        ValueError: if ``batch_size`` or ``chunk_num`` is not positive.
    """

    def __init__(self, file_list, batch_size, chunk_num, dimension):
        if batch_size <= 0 or chunk_num <= 0:
            raise ValueError('batch_size and chunk_num must be positive')
        self._batch_size = batch_size
        self._chunck_num = chunk_num
        self._chunck_size = self._chunck_num * self._batch_size
        self._dimension = dimension
        with codecs.open(file_list, 'r', 'utf-8') as list_file:
            # Drop blank lines up front (as get_samples/get_data do) so they
            # can neither crash the iterator nor shift chunk boundaries.
            self._dataset = [line for line in list_file.readlines()
                             if line.strip()]
        random.shuffle(self._dataset)

    def reset(self):
        """Reshuffle the utterance order in place (call before each epoch)."""
        random.shuffle(self._dataset)

    def _iter_batches(self, chunk, max_frames):
        """Yield every complete batch that can be cut from ``chunk``.

        ``chunk`` is a list of ``(features, label_and_frames)`` tensor pairs.
        Fresh tensors are allocated for each batch so that padding is always
        zero and earlier batches are never overwritten by later ones.
        """
        last_start = len(chunk) - self._batch_size
        for start in range(0, last_start + 1, self._batch_size):
            data = torch.zeros(self._batch_size, max_frames, self._dimension)
            target = torch.zeros(self._batch_size, 2)
            members = chunk[start:start + self._batch_size]
            for row, (features, label_and_frames) in enumerate(members):
                data[row, :features.size(0), :] = features
                target[row, :] = label_and_frames
            yield data, target

    def __iter__(self):
        chunk = []
        max_frames = 0
        for line in self._dataset:
            fields = line.split()
            if not fields:
                # Defensive: skip blank entries added after construction.
                continue
            target_label = int(str(fields[1]))

            htk_file = HTKfile(fields[0])
            features = torch.Tensor(htk_file.read_data())
            feature_frames = htk_file.get_frame_num()
            max_frames = max(max_frames, feature_frames)

            # Mean normalisation over the time axis.
            features = features - features.mean(dim=0, keepdim=True)
            chunk.append(
                (features, torch.Tensor([target_label, feature_frames])))

            if len(chunk) == self._chunck_size:
                for batch in self._iter_batches(chunk, max_frames):
                    yield batch
                chunk = []
                max_frames = 0

        # Tail of the list: emit whatever complete batches remain, including
        # the case where exactly one batch worth of utterances is left.
        for batch in self._iter_batches(chunk, max_frames):
            yield batch







# -*- coding:utf-8 -*-


"""a module to read the HTK format file"""


__author__ = 'yfhu3'
__email__ = 'yfhu3@iflytek.com'
__version__ = '2018.01.02 with python 2.7.14'


import numpy as np
import struct
import re


# HTK文件结构:
# 帧数:4字节(第0-第3字节)
# 采样周期:4字节(第4-第7字节)
# 每一帧的字节数:2字节(第8-第9字节)
# 参数类型:2字节(第10-第11字节)
# 数据:N字节(第12字节开始-文件结尾)




class HTKfile(object):
    """Reader for one HTK-format feature file holding 4-byte float features.

    File layout (big-endian):
        bytes 0-3   number of frames            (unsigned 32-bit)
        bytes 4-7   sample period               (unsigned 32-bit)
        bytes 8-9   number of bytes per frame   (unsigned 16-bit)
        bytes 10-11 parameter kind              (signed 16-bit)
        bytes 12-   frame data, ``bytes_per_frame / 4`` floats per frame

    The path may carry a trailing frame range such as ``feat.fea[2,56]`` or
    ``feat.fea[2, 56]``; ``read_data`` then returns only frames
    ``start <= i < end``.  An end of ``0`` or one beyond the file length is
    replaced by the total number of frames.

    Only the header is read on construction; no file handle is kept open.

    Args:
        path: feature file path, optionally followed by ``[start,end]``.

    Raises:
        ValueError: if the path ends with ``]`` but holds no valid range.
        IOError/OSError: if the file cannot be opened.
        struct.error: if the file is shorter than the 12-byte header.
    """

    _HEADER_FORMAT = '>IIHh'
    _HEADER_SIZE = struct.calcsize(_HEADER_FORMAT)  # 12 bytes

    def __init__(self, path):
        self.__start_frame = 0
        self.__end_frame = 0
        self.__new_path = ''
        # No state label is stored in the file; keep the attribute defined so
        # get_state_label() returns None instead of raising AttributeError.
        self.__state_label = None

        if path.endswith(']'):
            # Split "<file>[start,end]" on the LAST '[' so the file part is
            # kept whole, and ignore empty tokens so "[2, 56]" parses too.
            base, bracket, frame_range = path[:-1].rpartition('[')
            bounds = [tok for tok in re.split(r'[,\s]+', frame_range) if tok]
            if not bracket or len(bounds) < 2:
                raise ValueError('malformed frame range in path: %r' % (path,))
            self.__new_path = base
            self.__start_frame = int(bounds[-2])
            self.__end_frame = int(bounds[-1])
        else:
            self.__new_path = path

        # HTK stores values big-endian; '>' converts them on unpack.
        with open(self.__new_path, 'rb') as htk_input:
            header = htk_input.read(self._HEADER_SIZE)
        (self.__frame_num,
         self.__sample_period,
         self.__bytes_of_one_frame,
         self.__sample_kind) = struct.unpack(self._HEADER_FORMAT, header)
        # Integer division keeps the dimension an int on Python 2 and 3.
        self.__feature_dim = self.__bytes_of_one_frame // 4

        # The name is the second-to-last piece when splitting on '/' and '.',
        # i.e. the stem of "dir/name.ext".
        # NOTE(review): for names with several dots or no extension this is
        # not the file stem; kept for compatibility with existing callers.
        name_parts = re.split(r'[/.]', path)
        if len(name_parts) >= 2:
            self.__file_name = name_parts[-2]
        else:
            self.__file_name = name_parts[0]

        if self.__end_frame == 0 or self.__end_frame > self.__frame_num:
            # No (or an out-of-range) end frame given: use the whole file.
            self.__end_frame = self.__frame_num

    def read_data(self):
        """Return the selected frames as a float32 array ``(frames, dim)``.

        The file is reopened on every call, so the method can be called
        repeatedly.

        Raises:
            ValueError: if the file holds fewer data bytes than the header
                announces, or the frame size is not a multiple of 4 bytes.
        """
        payload_size = self.__frame_num * self.__bytes_of_one_frame
        with open(self.__new_path, 'rb') as htk_input:
            htk_input.seek(self._HEADER_SIZE)
            payload = htk_input.read(payload_size)
        if len(payload) != payload_size:
            raise ValueError('truncated HTK file %r: expected %d data bytes, '
                             'got %d' % (self.__new_path, payload_size,
                                         len(payload)))

        # Decode big-endian floats in one step, then copy to native float32.
        data = np.frombuffer(payload, dtype='>f4').astype('float32')
        data = data.reshape(self.__frame_num, self.__feature_dim)

        return data[self.__start_frame: self.__end_frame]

    # Accessors for values parsed from the path and the header.
    def get_start_frame(self):
        """Return the index of the first selected frame."""
        return self.__start_frame

    def get_end_frame(self):
        """Return the index one past the last selected frame."""
        return self.__end_frame

    def get_frame_num(self):
        """Return the number of selected frames (end minus start)."""
        return (self.__end_frame - self.__start_frame)

    def get_sample_period(self):
        """Return the sample period stored in the header."""
        return self.__sample_period

    def get_bytes_of_one_frame(self):
        """Return the number of bytes per frame stored in the header."""
        return self.__bytes_of_one_frame

    def get_file_name(self):
        """Return the name extracted from the path (see ``__init__``)."""
        return self.__file_name

    def get_feature_dim(self):
        """Return the number of 4-byte values per frame."""
        return self.__feature_dim

    def get_state_label(self):
        """Return the state label; always ``None`` as none is ever loaded."""
        return self.__state_label












# -*- coding:utf-8 -*-


import codecs
import copy
import random


import torch


from HTKfile import HTKfile


def get_samples(list):
    """Count the utterances in a list file and find the longest one.

    Each non-blank line is split on whitespace; the first field is handed to
    ``HTKfile`` and the resulting frame count is compared with the running
    maximum.  Blank lines are ignored.

    Args:
        list: path of the UTF-8 list file.  (The name shadows the builtin but
            is kept because it is part of the existing signature.)

    Returns:
        A ``(samples, max_frames)`` tuple: number of non-blank lines and the
        largest frame count seen (``0`` when the file has no entries).
    """
    utterance_count = 0
    longest = 0
    with codecs.open(list, 'r', 'utf-8') as list_file:
        for raw_line in list_file:
            fields = raw_line.split()
            if not fields:
                # Blank (or whitespace-only) line.
                continue

            frame_count = HTKfile(fields[0]).get_frame_num()
            if frame_count > longest:
                longest = frame_count
            utterance_count += 1
    return utterance_count, longest




def get_data(list, samples, max_frames, dimension):
    """Load every utterance of a list file into pre-sized, zero-padded tensors.

    Each non-blank line is split on whitespace into a feature path (field 0)
    and an integer label (field 1).  The features are read through
    ``HTKfile``, the per-dimension mean over frames is subtracted, and the
    result is copied into the next free row of the output tensor.

    Args:
        list: path of the UTF-8 list file (name kept for compatibility even
            though it shadows the builtin).
        samples: number of rows to allocate.
        max_frames: number of frames to allocate per row.
        dimension: feature dimension to allocate per frame.

    Returns:
        A ``(data, target_frames, name_list)`` tuple where ``data`` has shape
        ``(samples, max_frames, dimension)``, ``target_frames`` has shape
        ``(samples, 2)`` holding ``[label, frame_count]`` per row, and
        ``name_list`` holds the file names reported by ``HTKfile``.
    """
    data = torch.zeros(samples, max_frames, dimension)
    target_frames = torch.zeros(samples, 2)
    name_list = []

    with codecs.open(list, 'r', 'utf-8') as list_file:
        for raw_line in list_file:
            fields = raw_line.split()
            if not fields:
                # Blank (or whitespace-only) line.
                continue

            target_label = int(str(fields[1]))
            htk_file = HTKfile(fields[0])
            feature = torch.Tensor(htk_file.read_data())
            file_name = htk_file.get_file_name()
            frame_count = htk_file.get_frame_num()

            # One name is appended per stored utterance, so the list length is
            # also the index of the next free row.
            row = len(name_list)

            # Mean normalisation over the time axis.
            frame_mean = feature.mean(dim=0, keepdim=True)
            data[row, :frame_count, :] = feature - frame_mean.expand_as(feature)
            target_frames[row] = torch.Tensor([target_label, frame_count])
            name_list.append(file_name)

    return data, target_frames, name_list


class TorchDataSet(object):
    """Iterable dataset that yields fixed-size, zero-padded batches.

    The list file contains one utterance per line: a feature path followed by
    an integer label, separated by whitespace.  Utterances are loaded in
    chunks of ``chunk_num * batch_size``; every batch cut from a chunk is
    padded to the longest utterance of that chunk.

    Iterating yields ``(data, target)`` pairs where ``data`` has shape
    ``(batch_size, chunk_max_frames, dimension)`` and ``target`` has shape
    ``(batch_size, 2)`` holding ``[label, frame_count]`` per row.  Utterances
    that do not fill a complete batch at the end of the list are dropped.

    Args:
        file_list: path of the UTF-8 list file.
        batch_size: number of utterances per yielded batch (must be positive).
        chunk_num: number of batches per loaded chunk (must be positive).
        dimension: feature dimension of every frame.

    Raises:
        ValueError: if ``batch_size`` or ``chunk_num`` is not positive.
    """

    def __init__(self, file_list, batch_size, chunk_num, dimension):
        if batch_size <= 0 or chunk_num <= 0:
            raise ValueError('batch_size and chunk_num must be positive')
        self._batch_size = batch_size
        self._chunck_num = chunk_num
        self._chunck_size = self._chunck_num * self._batch_size
        self._dimension = dimension
        with codecs.open(file_list, 'r', 'utf-8') as list_file:
            # Drop blank lines up front (as get_samples/get_data do) so they
            # can neither crash the iterator nor shift chunk boundaries.
            self._dataset = [line for line in list_file.readlines()
                             if line.strip()]
        random.shuffle(self._dataset)

    def reset(self):
        """Reshuffle the utterance order in place (call before each epoch)."""
        random.shuffle(self._dataset)

    def _iter_batches(self, chunk, max_frames):
        """Yield every complete batch that can be cut from ``chunk``.

        ``chunk`` is a list of ``(features, label_and_frames)`` tensor pairs.
        Fresh tensors are allocated for each batch so that padding is always
        zero and earlier batches are never overwritten by later ones.
        """
        last_start = len(chunk) - self._batch_size
        for start in range(0, last_start + 1, self._batch_size):
            data = torch.zeros(self._batch_size, max_frames, self._dimension)
            target = torch.zeros(self._batch_size, 2)
            members = chunk[start:start + self._batch_size]
            for row, (features, label_and_frames) in enumerate(members):
                data[row, :features.size(0), :] = features
                target[row, :] = label_and_frames
            yield data, target

    def __iter__(self):
        chunk = []
        max_frames = 0
        for line in self._dataset:
            fields = line.split()
            if not fields:
                # Defensive: skip blank entries added after construction.
                continue
            target_label = int(str(fields[1]))

            htk_file = HTKfile(fields[0])
            features = torch.Tensor(htk_file.read_data())
            feature_frames = htk_file.get_frame_num()
            max_frames = max(max_frames, feature_frames)

            # Mean normalisation over the time axis.
            features = features - features.mean(dim=0, keepdim=True)
            chunk.append(
                (features, torch.Tensor([target_label, feature_frames])))

            if len(chunk) == self._chunck_size:
                for batch in self._iter_batches(chunk, max_frames):
                    yield batch
                chunk = []
                max_frames = 0

        # Tail of the list: emit whatever complete batches remain, including
        # the case where exactly one batch worth of utterances is left.
        for batch in self._iter_batches(chunk, max_frames):
            yield batch