Transformer模型整体构建的实现

时间:2025-01-19 09:04:39

编码器-解码器结构

from torch import nn


class EncoderDecoder(nn.Module):
    """Top-level encoder-decoder wrapper of the Transformer.

    Holds the encoder, the decoder, the two embedding callables and the
    output generator, and wires them together in ``forward``.
    """

    def __init__(self, encoder, decoder, source_embedding, target_embedding, generator):
        """Store the five sub-components on the module.

        Args:
            encoder: Encoder object, called as
                ``encoder(embedded_source, source_mask)``.
            decoder: Decoder object, called as
                ``decoder(embedded_target, encoder_output, source_mask, target_mask)``.
            source_embedding: Callable applied to the source input before encoding.
            target_embedding: Callable applied to the target input before decoding.
            generator: Output-side class generator. It is only stored here;
                ``forward`` does not apply it.
        """
        super(EncoderDecoder, self).__init__()
        # Keep every component as an attribute so that nn.Module registers
        # the ones that are themselves modules.
        self.encoder = encoder
        self.decoder = decoder
        self.source_embedding = source_embedding
        self.target_embedding = target_embedding
        self.generator = generator

    def forward(self, source_input, target_input, source_mask, target_mask):
        """Encode the source, then decode the target against the encoder output.

        Args:
            source_input: Source data.
            target_input: Target data.
            source_mask: Mask tensor for the source data.
            target_mask: Mask tensor for the target data.

        Returns:
            Whatever ``self.decode`` returns (the decoder output; the
            generator is not applied).
        """
        # Encode first, then hand the encoder output together with both
        # masks and the target input to the decoding step.
        encoder_output = self.encode(source_input, source_mask)
        return self.decode(encoder_output, source_mask, target_input, target_mask)

    def encode(self, source_input, source_mask):
        """Embed ``source_input`` and run it through the encoder with ``source_mask``."""
        return self.encoder(self.source_embedding(source_input), source_mask)

    def decode(self, encoder_output, source_mask, target_input, target_mask):
        """Embed ``target_input`` and run the decoder.

        Args:
            encoder_output: Output of the encoder (the "memory").
            source_mask: Mask tensor for the source data.
            target_input: Target data.
            target_mask: Mask tensor for the target data.

        Returns:
            The result of ``self.decoder`` called with the embedded target,
            the encoder output and both masks.
        """
        return self.decoder(self.target_embedding(target_input), encoder_output,
                            source_mask, target_mask)


if __name__ == "__main__":
    # Smoke test: assemble a small encoder-decoder from the building blocks
    # and push one dummy batch through it.
    # Local imports keep this demo self-contained.
    import torch
    from torch import nn

    # Hyper-parameters
    test_embedding_dim = 512
    test_vocab_size = 10000
    test_max_len = 100
    test_heads = 8
    test_dropout = 0.2
    d_ffl = 64
    size = d_model = test_embedding_dim

    # Source and target are the same here for convenience; in practice they differ.
    # Token ids must be an integer tensor so they can index the embedding table.
    source = target = test_input_tensor = torch.LongTensor([[1, 2, 3, 4], [4, 3, 2, 1]])

    # Multi-head attention (the first is used as self-attention, the second as
    # regular attention). The source mask and target mask may differ in
    # practice; they are identical here to keep the example simple.
    test_mask = torch.zeros(8, 4, 4)
    src_mask = tar_mask = test_mask
    self_mha = mha = MultiHeadedAttention(test_heads, test_embedding_dim, test_dropout)

    # Position-wise feed-forward layer
    ffl = FeedForwardLayer(d_model, d_ffl, test_dropout)

    # Number of stacked encoder / decoder layers
    test_num_layers = 4
    # Encoder layer
    el = EncoderLayer(size, deepcopy(self_mha), deepcopy(ffl), test_dropout)
    # Encoder
    encoder = TransformerEncoder(el, test_num_layers)
    # Decoder layer
    dl = DecoderLayer(test_embedding_dim, deepcopy(self_mha), deepcopy(mha), deepcopy(ffl), test_dropout)
    # Decoder
    decoder = TransformerDecoder(dl, test_num_layers)
    # Output part
    output = TransformerOutput(test_embedding_dim, test_vocab_size)

    # Encoder-decoder: the embedding table has one row per vocabulary entry
    # and d_model columns.
    source_embed = nn.Embedding(test_vocab_size, d_model)
    target_embed = nn.Embedding(test_vocab_size, d_model)
    ed = EncoderDecoder(encoder, decoder, source_embed, target_embed, output)
    ed_result = ed(source, target, src_mask, tar_mask)
    print(ed_result)
    print(ed_result.shape)

 

tensor([[[-1.5362,  0.6945,  0.1928,  ..., -0.1635,  0.9268,  0.9474],
         [-2.1193,  0.9950, -0.2294,  ..., -0.8179,  1.5066,  1.3784],
         [-0.8416,  0.9558,  0.1298,  ...,  1.1093,  0.8565, -0.2909],
         [-0.6144,  0.5424, -0.0701,  ..., -0.8175,  0.9698,  0.0310]],

        [[-0.7840,  0.1226, -0.1851,  ..., -0.8425,  1.4955,  0.6446],
         [-0.3039,  0.5960,  0.1360,  ...,  0.8229,  1.3549, -0.6942],
         [-2.0222,  0.6236, -0.5268,  ..., -1.3863,  1.0146,  1.1675],
         [-1.9935,  0.2078,  0.9256,  ..., -1.0024,  1.0066,  1.0787]]],
       grad_fn=<AddBackward0>)
torch.Size([2, 4, 512])

Transformer模型构建函数


def build_transformer_model(source_vocab_size, target_vocab_size, num_layers=4,
                            model_dim=512, feedforward_dim=2048, num_heads=8, dropout=0.1):
    """Build a complete encoder-decoder Transformer model.

    Args:
        source_vocab_size: Total number of tokens in the source vocabulary.
        target_vocab_size: Total number of tokens in the target vocabulary.
        num_layers: Number of stacked encoder layers and decoder layers.
        model_dim: Word-embedding dimension.
        feedforward_dim: Inner dimension of the feed-forward network.
        num_heads: Number of heads in multi-head attention.
        dropout: Dropout (zeroing) ratio.

    Returns:
        The assembled ``EncoderDecoder`` model, with every parameter of more
        than one dimension initialised by Xavier uniform initialisation.
    """
    # Local import so the function brings its own dependency into scope.
    from torch import nn

    # Prototype attention module; every use below gets its own deep copy.
    # The dropout ratio is forwarded so attention honours the caller's value.
    attention = MultiHeadedAttention(num_heads, model_dim, dropout)

    # Prototype feed-forward layer
    feedforward = FeedForwardLayer(model_dim, feedforward_dim, dropout)

    # Prototype positional encoding
    positional_encoding = PositionalEncoding(model_dim, dropout)

    # The outermost module is EncoderDecoder. It receives, in order:
    #   - the encoder (each layer: one attention sub-layer + feed-forward),
    #   - the decoder (each layer: two attention sub-layers + feed-forward),
    #   - source text embedding followed by positional encoding,
    #   - target text embedding followed by positional encoding,
    #   - the class generator (output layer).
    encoder_layer = EncoderLayer(model_dim, deepcopy(attention), deepcopy(feedforward), dropout)
    decoder_layer = DecoderLayer(model_dim, deepcopy(attention), deepcopy(attention),
                                 deepcopy(feedforward), dropout)
    model = EncoderDecoder(
        TransformerEncoder(encoder_layer, num_layers),
        TransformerDecoder(decoder_layer, num_layers),
        nn.Sequential(TextEmbeddings(source_vocab_size, model_dim), deepcopy(positional_encoding)),
        nn.Sequential(TextEmbeddings(target_vocab_size, model_dim), deepcopy(positional_encoding)),
        TransformerOutput(model_dim, target_vocab_size))

    # With the structure in place, initialise the parameters (e.g. the weight
    # matrices of the linear layers): every parameter with more than one
    # dimension is re-initialised from a Xavier uniform distribution.
    for param in model.parameters():
        if param.dim() > 1:
            nn.init.xavier_uniform_(param)
    return model


if __name__ == "__main__":
    # Tiny demo: build a model with 12-token vocabularies on both sides and
    # print its module structure.
    source_vocab = target_vocab = 12
    test_num_layers = 4
    result = build_transformer_model(
        source_vocab,
        target_vocab,
        num_layers=test_num_layers,
    )
    print(result)

EncoderDecoder(
  (encoder): TransformerEncoder(
    (encoder_layers): ModuleList(
      (0-3): 4 x EncoderLayer(
        (self_attention): MultiHeadedAttention(
          (linears): ModuleList(
            (0-3): 4 x Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): FeedForwardLayer(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (sublayers): ModuleList(
          (0-1): 2 x SublayerConnectionWithNormalization(
            (norm): NormalizationLayer()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (norm_layer): NormalizationLayer()
  )
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-3): 4 x DecoderLayer(
        (self_mha_attention): MultiHeadedAttention(
          (linears): ModuleList(
            (0-3): 4 x Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (mha_attention): MultiHeadedAttention(
          (linears): ModuleList(
            (0-3): 4 x Linear(in_features=512, out_features=512, bias=True)
          )
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feed_forward): FeedForwardLayer(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (sublayers): ModuleList(
          (0-2): 3 x SublayerConnectionWithNormalization(
            (norm): NormalizationLayer()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
    )
    (norm): NormalizationLayer()
  )
  (source_embedding): Sequential(
    (0): TextEmbeddings(
      (embedding_layer): Embedding(12, 512)
    )
    (1): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (target_embedding): Sequential(
    (0): TextEmbeddings(
      (embedding_layer): Embedding(12, 512)
    )
    (1): PositionalEncoding(
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (generator): TransformerOutput(
    (linear): Linear(in_features=512, out_features=12, bias=True)
  )
)