YOLOv5 Improvement Series (23): Replacing the Backbone with MobileViTv2 (Efficient Separable Self-Attention for Mobile Vision Transformers)

Date: 2025-04-02 09:15:44
```python
# MobileViTv2
import math

import torch
import torch.nn as nn
from einops import rearrange


def autopad(k, p=None):  # kernel, padding
    # Derive 'same' padding automatically when none is given
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]  # auto-pad
    return p


# Standard convolution layer
class Conv(nn.Module):
    # Standard convolution: Conv2d + BatchNorm + SiLU
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super().__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        self.act = nn.SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

    def forward_fuse(self, x):
        return self.act(self.conv(x))


# Depth-wise separable convolution
class DWConv(Conv):
    # Depth-wise convolution class
    def __init__(self, c1, c2, k=1, s=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), act=act)


# 1x1 convolution branch with BatchNorm
def conv_1x1_bn(inp, oup):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
        nn.BatchNorm2d(oup),
        nn.SiLU()
    )
```
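As a quick illustration (my own sketch, not part of the original post): `autopad` reproduces "same" padding for odd kernel sizes, and `DWConv` uses `math.gcd(c1, c2)` groups, so it becomes a true depth-wise convolution when the input and output channel counts match. The shapes below are example values.

```python
# Minimal sketch: check autopad and DWConv behaviour (example values, not from the post)
import torch

print(autopad(3), autopad(5))                # 1 2 -> 'same' padding for 3x3 and 5x5 kernels
dw = DWConv(32, 32, k=3)                     # groups = gcd(32, 32) = 32, one filter per channel
print(dw(torch.randn(1, 32, 16, 16)).shape)  # torch.Size([1, 32, 16, 16])
```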
```python
# --- ViT part --- #

# Wrapper that applies LayerNorm before the wrapped layer
class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        '''
        dim: input and output dimension
        fn: the wrapped layer, either multi-head attention or the MLP
        '''
        super().__init__()
        # LayerNorm: (a - mean(last dim)) / sqrt(var(last dim))
        # Set the normalized dimension and store the wrapped layer
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)


# FFN
class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout=0.):
        '''
        dim: input and output dimension
        hidden_dim: hidden-layer dimension
        dropout: dropout probability p
        '''
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)


# Attention
class Attention(nn.Module):
    def __init__(self, dim, heads=8, dim_head=64, dropout=0.):
        super().__init__()
        inner_dim = heads * dim_head
        project_out = not (heads == 1 and dim_head == dim)

        self.heads = heads
        # 1 / sqrt(dim_head): scales the dot products so their variance stays near 1,
        # preventing large inner products from pushing softmax towards zeroing out most outputs
        # (see "Attention Is All You Need" on scaled dot-product attention)
        self.scale = dim_head ** -0.5
        # dim=-1: softmax over the last dimension (the key dimension)
        self.attend = nn.Softmax(dim=-1)
        # Produce q, k and v with one projection; they are split apart later
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
        # With multi-head attention, project back to dim and apply dropout to avoid overfitting;
        # otherwise pass the output through unchanged
        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x):
        # Split into the q, k and v matrices
        # qkv has size inner_dim * 3, where inner_dim = heads * dim_head
        qkv = self.to_qkv(x).chunk(3, dim=-1)
        # 'b p n (h d) -> b p h n d': separate the heads, giving one set of q, k, v per head
        q, k, v = map(lambda t: rearrange(t, 'b p n (h d) -> b p h n d', h=self.heads), qkv)
        # query * key gives the attention scores over value, scaled so softmax stays well-behaved
        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        # Softmax over the last dimension gives the attention weights
        attn = self.attend(dots)
        # Weighted sum of the values
        out = torch.matmul(attn, v)
        # Reassemble the tensor, merging the heads back together
        out = rearrange(out, 'b p h n d -> b p n (h d)')
        return self.to_out(out)
```
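To make the tensor layout concrete, here is a minimal sanity check of my own (not from the original post). The attention expects a 4D input of shape (batch, pixels per patch, number of patches, dim), which is exactly what the MobileViTv2 block produces after its rearrange; heads=4 and dim_head=8 match the Transformer call used later.

```python
# Minimal sketch: shape check for Attention (heads=4, dim_head=8 as in the block below)
import torch

attn = Attention(dim=64, heads=4, dim_head=8)
x = torch.randn(1, 4, 256, 64)   # (batch, ph*pw, h*w, dim), e.g. 2x2 patches on a 32x32 feature map
print(attn(x).shape)             # torch.Size([1, 4, 256, 64])
```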
```python
# Transformer encoder
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout=0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads, dim_head, dropout)),
                PreNorm(dim, FeedForward(dim, mlp_dim, dropout))
            ]))

    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
        return x


# --- MobileViTv2 part --- #

# MV2 (MobileNetV2 inverted residual) block
class MV2Block(nn.Module):
    def __init__(self, inp, oup, stride=1, expansion=4):
        super().__init__()
        self.stride = stride
        assert stride in [1, 2]

        hidden_dim = int(inp * expansion)
        self.use_res_connect = self.stride == 1 and inp == oup

        if expansion == 1:  # expansion ratio
            self.conv = nn.Sequential(
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),  # 3x3 depth-wise conv
                nn.BatchNorm2d(hidden_dim),  # BN layer
                nn.SiLU(),  # SiLU activation
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),  # 1x1 conv
                nn.BatchNorm2d(oup),  # BN layer
            )
        else:
            self.conv = nn.Sequential(
                # pw
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),  # 1x1 conv
                nn.BatchNorm2d(hidden_dim),  # BN layer
                nn.SiLU(),  # SiLU activation
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),  # 3x3 depth-wise conv
                nn.BatchNorm2d(hidden_dim),  # BN layer
                nn.SiLU(),  # SiLU activation
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),  # 1x1 conv
                nn.BatchNorm2d(oup),  # BN layer
            )

    def forward(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)
```
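A short sketch of how the inverted residual behaves (illustrative values of my own, not from the original post): with stride 1 and matching channel counts the block adds a skip connection and keeps the resolution, while stride 2 downsamples and drops the residual.

```python
# Minimal sketch: MV2Block with and without the residual connection
import torch

x = torch.randn(1, 64, 32, 32)
print(MV2Block(64, 64, stride=1)(x).shape)    # torch.Size([1, 64, 32, 32]), residual path used
print(MV2Block(64, 128, stride=2)(x).shape)   # torch.Size([1, 128, 16, 16]), no residual
```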
```python
# MobileViTv2_Block (core module)
class MobileViTv2_Block(nn.Module):
    def __init__(self, sim_channel, dim=64, depth=2, kernel_size=3, patch_size=(2, 2), mlp_dim=int(64 * 2), dropout=0.):
        super().__init__()
        self.ph, self.pw = patch_size  # patch height and width
        self.conv1 = DWConv(sim_channel, sim_channel, kernel_size)  # 3x3 depth-wise separable conv
        self.conv2 = conv_1x1_bn(sim_channel, dim)  # 1x1 conv
        self.transformer = Transformer(dim, depth, 4, 8, mlp_dim, dropout)  # Transformer encoder
        self.conv3 = conv_1x1_bn(dim, sim_channel)  # 1x1 conv
        self.mv2 = MV2Block(sim_channel, sim_channel)  # MV2 block

    def forward(self, x):
        # Local representations
        x = self.conv1(x)
        x = self.conv2(x)

        # Global representations
        _, _, h, w = x.shape
        x = rearrange(x, 'b d (h ph) (w pw) -> b (ph pw) (h w) d', ph=self.ph, pw=self.pw)
        x = self.transformer(x)
        x = rearrange(x, 'b (ph pw) (h w) d -> b d (h ph) (w pw)', h=h // self.ph, w=w // self.pw,
                      ph=self.ph, pw=self.pw)

        x = self.conv3(x)
        x = self.mv2(x)
        return x
```
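Finally, a usage sketch of my own (not from the original post): the block preserves both the channel count and the spatial resolution, so it can stand in for a stage of the YOLOv5 backbone as long as the feature-map height and width are divisible by the patch size (2×2 here).

```python
# Minimal sketch: the block keeps (C, H, W) unchanged; H and W must be divisible by the patch size
import torch

block = MobileViTv2_Block(sim_channel=128, dim=64, depth=2)
x = torch.randn(1, 128, 32, 32)
print(block(x).shape)   # torch.Size([1, 128, 32, 32])
```

To wire it into YOLOv5 you would typically register the module in models/common.py and reference it from the backbone yaml; the exact layer configuration depends on which stages you choose to replace.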