YOLOv5 Improvement Series (31): Adding the Dual-ViT Attention Mechanism (TPAMI 2023 | JD's Multi-Scale Dual Vision Transformer That Reduces Computational Cost)

Date: 2025-04-02 09:12:54
  • #DualViT
Below is the complete module code, one class at a time.

```python
import torch
import torch.nn as nn
import torch.nn.functional as F
from functools import partial
from timm.models.layers import DropPath, to_2tuple, trunc_normal_
import math


# ========== 1. DWConv class: depthwise separable convolution ==========
class DWConv(nn.Module):
    def __init__(self, dim=768):
        super(DWConv, self).__init__()
        # groups=dim gives one 3x3 filter per channel, i.e. a depthwise convolution
        self.dwconv = nn.Conv2d(dim, dim, 3, 1, 1, bias=True, groups=dim)

    def forward(self, x, H, W):
        B, N, C = x.shape                        # token sequence with N = H * W
        x = x.transpose(1, 2).view(B, C, H, W)   # restore the 2D feature map
        x = self.dwconv(x)
        x = x.flatten(2).transpose(1, 2)         # flatten back to (B, N, C)
        return x
```
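As a quick sanity check (a minimal sketch that is not part of the original post; the sizes are arbitrary), DWConv should return a tensor of the same shape it receives:

```python
# Assumes the code above has already been run in the current session.
dw = DWConv(dim=64)
tokens = torch.randn(2, 14 * 14, 64)   # (B, N, C) token sequence with N = H * W
out = dw(tokens, H=14, W=14)
print(out.shape)                       # torch.Size([2, 196, 64]): same shape as the input
```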
```python
# ========== 2. PVT2FFN class: combines linear layers, a depthwise separable convolution and an activation,
# processing the input features through a series of linear and non-linear transformations ==========
class PVT2FFN(nn.Module):
    def __init__(self, in_features, hidden_features):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.dwconv = DWConv(hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, in_features)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            if m.weight is not None:
                nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        x = self.fc1(x)             # expand channels
        x = self.dwconv(x, H, W)    # local spatial mixing
        x = self.act(x)
        x = self.fc2(x)             # project back
        return x
```
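An illustrative usage sketch of PVT2FFN (the dimensions are chosen arbitrarily, not taken from the paper):

```python
ffn = PVT2FFN(in_features=64, hidden_features=64 * 4)
tokens = torch.randn(2, 14 * 14, 64)
out = ffn(tokens, H=14, W=14)       # expand -> depthwise conv -> GELU -> project back
print(out.shape)                    # torch.Size([2, 196, 64])
```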
```python
# ========== 3. MergeFFN class: feature fusion and transformation for pixel tokens plus semantic tokens ==========
class MergeFFN(nn.Module):
    def __init__(self, in_features, hidden_features):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.dwconv = DWConv(hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, in_features)
        # separate MLP for the semantic (proxy) tokens appended after the H*W pixel tokens
        self.fc_proxy = nn.Sequential(
            nn.Linear(in_features, 2 * in_features),
            nn.GELU(),
            nn.Linear(2 * in_features, in_features),
        )
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            if m.weight is not None:
                nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x, H, W):
        # split the sequence into the H*W pixel tokens and the remaining semantic tokens
        x, semantics = torch.split(x, [H * W, x.shape[1] - H * W], dim=1)
        semantics = self.fc_proxy(semantics)
        x = self.fc1(x)
        x = self.dwconv(x, H, W)
        x = self.act(x)
        x = self.fc2(x)
        x = torch.cat([x, semantics], dim=1)   # re-attach the transformed semantic tokens
        return x
```
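The split/concat logic is easiest to see with concrete numbers. In this illustrative sketch, 196 pixel tokens and 8 extra semantic (proxy) tokens share one sequence; the token counts are made up for the example:

```python
merge_ffn = MergeFFN(in_features=64, hidden_features=64 * 4)
pixel_tokens = torch.randn(2, 14 * 14, 64)   # H*W spatial tokens
semantic_tokens = torch.randn(2, 8, 64)      # proxy tokens appended after them
seq = torch.cat([pixel_tokens, semantic_tokens], dim=1)
out = merge_ffn(seq, H=14, W=14)
print(out.shape)                             # torch.Size([2, 204, 64]): both parts re-concatenated
```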
```python
# ========== 4. Attention class: multi-head self-attention ==========
class Attention(nn.Module):
    def __init__(self, dim, num_heads):
        super().__init__()
        assert dim % num_heads == 0, f"dim {dim} should be divided by num_heads {num_heads}."
        self.dim = dim
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim ** -0.5
        self.q = nn.Linear(dim, dim)
        self.kv = nn.Linear(dim, dim * 2)
        self.proj = nn.Linear(dim, dim)
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            if m.weight is not None:
                nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x):
        # x = x.permute(3, 0, 1, 2)
        B, H, W, C = x.shape                     # channels-last feature map
        N = H * W
        q = self.q(x).reshape(B, N, self.num_heads, C // self.num_heads).permute(0, 2, 1, 3)
        kv = self.kv(x).reshape(B, -1, 2, self.num_heads, C // self.num_heads).permute(2, 0, 3, 1, 4)
        k, v = kv[0], kv[1]
        attn = (q @ k.transpose(-2, -1)) * self.scale   # scaled dot-product attention
        attn = attn.softmax(dim=-1)
        x = (attn @ v).transpose(1, 2).reshape(B, H, W, C)
        x = self.proj(x)
        return x
```
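Attention operates directly on a channels-last feature map; a small illustrative check (the shapes are arbitrary) confirms the shape is preserved:

```python
attn = Attention(dim=64, num_heads=2)
feat = torch.randn(2, 14, 14, 64)       # (B, H, W, C)
print(attn(feat).shape)                 # torch.Size([2, 14, 14, 64]): shape is preserved
```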
```python
# ========== 5. MergeBlockattention class: the core module of this post ==========
class MergeBlockattention(nn.Module):
    def __init__(self, input, dim, num_heads=2, mlp_ratio=8, drop_path=0., norm_layer=nn.LayerNorm, is_last=False):
        # `input`: in-channels passed by the YOLOv5 model parser; unused here, kept for interface compatibility
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.norm2 = norm_layer(dim)
        self.attn = Attention(dim, num_heads)
        '''
        self.norm1: the first normalization layer (nn.LayerNorm), which normalizes the input.
        self.norm2: the second normalization layer, likewise used for normalization.
        self.attn: the attention module (Attention) that computes the attention representation of the input.
        self.mlp: the MLP chosen according to is_last. If is_last is True, PVT2FFN is used;
                  otherwise the MergeFFN defined above is used.
        self.is_last: flag indicating whether this is the last block.
        self.drop_path: if drop_path is greater than 0, DropPath is applied with that probability;
                        otherwise an identity mapping (nn.Identity()) is used.
        self.gamma1, self.gamma2: learnable layer-scale parameters that rescale the normalized features.
        '''
        if is_last:
            self.mlp = PVT2FFN(in_features=dim, hidden_features=int(dim * mlp_ratio))
        else:
            self.mlp = MergeFFN(in_features=dim, hidden_features=int(dim * mlp_ratio))
        self.is_last = is_last
        self.drop_path = DropPath(drop_path) if drop_path > 0. else nn.Identity()

        layer_scale_init_value = 1e-6
        self.gamma1 = nn.Parameter(layer_scale_init_value * torch.ones((dim)),
                                   requires_grad=True) if layer_scale_init_value > 0 else None
        self.gamma2 = nn.Parameter(layer_scale_init_value * torch.ones((dim)),
                                   requires_grad=True) if layer_scale_init_value > 0 else None
        self.apply(self._init_weights)

    def _init_weights(self, m):
        if isinstance(m, nn.Linear):
            trunc_normal_(m.weight, std=.02)
            if isinstance(m, nn.Linear) and m.bias is not None:
                nn.init.constant_(m.bias, 0)
        elif isinstance(m, nn.LayerNorm):
            if m.bias is not None:
                nn.init.constant_(m.bias, 0)
            if m.weight is not None:
                nn.init.constant_(m.weight, 1.0)
        elif isinstance(m, nn.Conv2d):
            fan_out = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
            fan_out //= m.groups
            m.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
            if m.bias is not None:
                m.bias.data.zero_()

    def forward(self, x):
        B, C, H, W = x.shape
        x = x.permute(0, 2, 3, 1)                # to channels-last (B, H, W, C) for Attention
        # x = x + self.drop_path(self.gamma1 * self.attn(self.norm1(x)))
        x = self.attn(self.norm1(x))
        x = x.permute(0, 3, 1, 2)                # back to (B, C, H, W)
        return x
```
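Before registering the module in a YOLOv5 yaml, it is worth confirming that a forward pass keeps the feature-map shape unchanged. The following is a minimal smoke test with illustrative channel and spatial sizes, not values from the original post:

```python
block = MergeBlockattention(input=64, dim=64, num_heads=2)
feat = torch.randn(1, 64, 20, 20)       # (B, C, H, W), e.g. a 20x20 YOLOv5 feature map
out = block(feat)
print(out.shape)                        # torch.Size([1, 64, 20, 20]): drop-in compatible shape
```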