编码器-解码器结构
from torch import nn  # harmless if the file already imports it; required for the base class


class EncoderDecoder(nn.Module):
    """Encoder-decoder wrapper that chains embedding, encoding and decoding.

    The class only wires its collaborators together; all of the actual
    computation happens inside the objects passed to the constructor.
    """

    def __init__(self, encoder, decoder, source_embedding, target_embedding, generator):
        """Store the five collaborators.

        Args:
            encoder: Callable invoked as ``encoder(embedded_source, source_mask)``.
            decoder: Callable invoked as
                ``decoder(embedded_target, encoder_output, source_mask, target_mask)``.
            source_embedding: Callable that embeds the source input.
            target_embedding: Callable that embeds the target input.
            generator: Output (class generator) layer. It is stored on the
                model but is NOT applied by ``forward``; callers apply it
                themselves to the decoder output.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        self.source_embedding = source_embedding
        self.target_embedding = target_embedding
        self.generator = generator

    def forward(self, source_input, target_input, source_mask, target_mask):
        """Encode the source, then decode the target against that encoding.

        Args:
            source_input: Source data.
            target_input: Target data.
            source_mask: Mask for the source data.
            target_mask: Mask for the target data.

        Returns:
            Whatever ``decode`` returns (the raw decoder output).
        """
        # The encoder result acts as the "memory" the decoder attends to.
        encoder_output = self.encode(source_input, source_mask)
        return self.decode(encoder_output, source_mask, target_input, target_mask)

    def encode(self, source_input, source_mask):
        """Embed ``source_input`` and pass it, with ``source_mask``, to the encoder."""
        embedded_source = self.source_embedding(source_input)
        return self.encoder(embedded_source, source_mask)

    def decode(self, encoder_output, source_mask, target_input, target_mask):
        """Embed ``target_input`` and pass it to the decoder.

        Args:
            encoder_output: Output of ``encode`` (the memory).
            source_mask: Mask for the source data.
            target_input: Target data.
            target_mask: Mask for the target data.

        Returns:
            The decoder's output.
        """
        embedded_target = self.target_embedding(target_input)
        return self.decoder(embedded_target, encoder_output, source_mask, target_mask)
if __name__ == "__main__":
    # Smoke test: build a small encoder-decoder by hand and run one forward pass.
    import torch
    from torch import nn

    # Hyper-parameters.
    test_embedding_dim = 512
    test_vocab_size = 10000
    test_max_len = 100
    test_heads = 8
    test_dropout = 0.2
    d_ffl = 64
    size = d_model = test_embedding_dim

    # Source and target are identical here for convenience; in real use they differ.
    # Integer token ids are required because they index an nn.Embedding below.
    source = target = test_input_tensor = torch.tensor(
        [[1, 2, 3, 4], [4, 3, 2, 1]], dtype=torch.long
    )

    # The source mask and target mask may differ in practice; the same tensor
    # is reused here to keep the example short.
    # NOTE(review): the mask constructor was lost in the original text;
    # torch.zeros(8, 4, 4) is assumed -- confirm against MultiHeadedAttention.
    test_mask = torch.zeros(8, 4, 4)
    src_mask = tar_mask = test_mask

    # Multi-head attention: the first name is used as self-attention, the
    # second as the ordinary (encoder-decoder) attention. Each consumer below
    # receives its own deepcopy.
    self_mha = mha = MultiHeadedAttention(test_heads, test_embedding_dim, test_dropout)

    # Position-wise feed-forward layer.
    ffl = FeedForwardLayer(d_model, d_ffl, test_dropout)

    # Number of stacked encoder / decoder layers.
    test_num_layers = 4

    # Encoder layer and encoder stack.
    el = EncoderLayer(size, deepcopy(self_mha), deepcopy(ffl), test_dropout)
    encoder = TransformerEncoder(el, test_num_layers)

    # Decoder layer and decoder stack.
    dl = DecoderLayer(test_embedding_dim, deepcopy(self_mha), deepcopy(mha), deepcopy(ffl), test_dropout)
    decoder = TransformerDecoder(dl, test_num_layers)

    # Output (generator) layer.
    output = TransformerOutput(test_embedding_dim, test_vocab_size)

    # Embeddings for the source and target sides.
    # NOTE(review): the first argument of nn.Embedding is the vocabulary size;
    # test_embedding_dim (512) is kept from the original and is large enough
    # for the ids used above, but test_vocab_size may have been intended.
    source_embed = nn.Embedding(test_embedding_dim, d_model)
    target_embed = nn.Embedding(test_embedding_dim, d_model)

    # Assemble the encoder-decoder and run it once.
    ed = EncoderDecoder(encoder, decoder, source_embed, target_embed, output)
    ed_result = ed(source, target, src_mask, tar_mask)
    print(ed_result)
    print(ed_result.shape)
tensor([[[-1.5362, 0.6945, 0.1928, ..., -0.1635, 0.9268, 0.9474],
         [-2.1193, 0.9950, -0.2294, ..., -0.8179, 1.5066, 1.3784],
         [-0.8416, 0.9558, 0.1298, ..., 1.1093, 0.8565, -0.2909],
         [-0.6144, 0.5424, -0.0701, ..., -0.8175, 0.9698, 0.0310]],

        [[-0.7840, 0.1226, -0.1851, ..., -0.8425, 1.4955, 0.6446],
         [-0.3039, 0.5960, 0.1360, ..., 0.8229, 1.3549, -0.6942],
         [-2.0222, 0.6236, -0.5268, ..., -1.3863, 1.0146, 1.1675],
         [-1.9935, 0.2078, 0.9256, ..., -1.0024, 1.0066, 1.0787]]],
       grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])
Transformer模型构建函数
def build_transformer_model(source_vocab_size, target_vocab_size, num_layers=4,
                            model_dim=512, feedforward_dim=2048, num_heads=8, dropout=0.1):
    """Build a complete encoder-decoder Transformer model.

    Args:
        source_vocab_size: Number of entries in the source vocabulary.
        target_vocab_size: Number of entries in the target vocabulary.
        num_layers: Number of stacked layers in both the encoder and the decoder.
        model_dim: Word-embedding (model) dimension.
        feedforward_dim: Inner dimension of the position-wise feed-forward layer.
        num_heads: Number of heads in multi-head attention.
        dropout: Dropout rate, passed to every sub-component that accepts one.

    Returns:
        An ``EncoderDecoder`` whose parameters with more than one dimension
        have been Xavier-uniform initialised.
    """
    from torch import nn  # local import keeps this function self-contained

    # Prototype sub-modules. Each consumer below gets its own deepcopy so that
    # no weights are shared between layers.
    attention = MultiHeadedAttention(num_heads, model_dim, dropout)
    feedforward = FeedForwardLayer(model_dim, feedforward_dim, dropout)
    positional_encoding = PositionalEncoding(model_dim, dropout)

    # Encoder layer: one attention sub-layer plus one feed-forward sub-layer.
    encoder = TransformerEncoder(
        EncoderLayer(model_dim, deepcopy(attention), deepcopy(feedforward), dropout),
        num_layers,
    )

    # Decoder layer: two attention sub-layers (self-attention, then attention
    # over the encoder output) plus one feed-forward sub-layer.
    decoder = TransformerDecoder(
        DecoderLayer(model_dim, deepcopy(attention), deepcopy(attention),
                     deepcopy(feedforward), dropout),
        num_layers,
    )

    # Each embedding is an ordered pipeline: token embedding -> positional encoding.
    source_embedding = nn.Sequential(
        TextEmbeddings(source_vocab_size, model_dim),
        deepcopy(positional_encoding),
    )
    target_embedding = nn.Sequential(
        TextEmbeddings(target_vocab_size, model_dim),
        deepcopy(positional_encoding),
    )

    # Class generator (output) layer.
    generator = TransformerOutput(model_dim, target_vocab_size)

    model = EncoderDecoder(encoder, decoder, source_embedding, target_embedding, generator)

    # Initialise the weights: every parameter with more than one dimension
    # (e.g. the weight matrices of linear layers) is drawn from a Xavier
    # uniform distribution; 1-D parameters such as biases are left untouched.
    for param in model.parameters():
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)
    return model
if __name__ == "__main__":
    # Demo: build a small model and print its module structure.
    source_vocab = 12
    target_vocab = 12
    test_num_layers = 4
    result = build_transformer_model(source_vocab, target_vocab, num_layers=test_num_layers)
    print(result)
EncoderDecoder(
(encoder): TransformerEncoder(
(encoder_layers): ModuleList(
(0-3): 4 x EncoderLayer(
(self_attention): MultiHeadedAttention(
(linears): ModuleList(
(0-3): 4 x Linear(in_features=512, out_features=512, bias=True)
)
(dropout): Dropout(p=0.1, inplace=False)
)
(feed_forward): FeedForwardLayer(
(linear1): Linear(in_features=512, out_features=2048, bias=True)
(linear2): Linear(in_features=2048, out_features=512, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(sublayers): ModuleList(
(0-1): 2 x SublayerConnectionWithNormalization(
(norm): NormalizationLayer()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
)
(norm_layer): NormalizationLayer()
)
(decoder): TransformerDecoder(
(layers): ModuleList(
(0-3): 4 x DecoderLayer(
(self_mha_attention): MultiHeadedAttention(
(linears): ModuleList(
(0-3): 4 x Linear(in_features=512, out_features=512, bias=True)
)
(dropout): Dropout(p=0.1, inplace=False)
)
(mha_attention): MultiHeadedAttention(
(linears): ModuleList(
(0-3): 4 x Linear(in_features=512, out_features=512, bias=True)
)
(dropout): Dropout(p=0.1, inplace=False)
)
(feed_forward): FeedForwardLayer(
(linear1): Linear(in_features=512, out_features=2048, bias=True)
(linear2): Linear(in_features=2048, out_features=512, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(sublayers): ModuleList(
(0-2): 3 x SublayerConnectionWithNormalization(
(norm): NormalizationLayer()
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
)
(norm): NormalizationLayer()
)
(source_embedding): Sequential(
(0): TextEmbeddings(
(embedding_layer): Embedding(12, 512)
)
(1): PositionalEncoding(
(dropout): Dropout(p=0.1, inplace=False)
)
)
(target_embedding): Sequential(
(0): TextEmbeddings(
(embedding_layer): Embedding(12, 512)
)
(1): PositionalEncoding(
(dropout): Dropout(p=0.1, inplace=False)
)
)
(generator): TransformerOutput(
(linear): Linear(in_features=512, out_features=12, bias=True)
)
)