YOLOv5 Improvement Series (24): Replacing the Backbone with MobileViTv3 (a Further Upgrade of the Lightweight Mobile Network)

Time: 2025-04-02 09:15:17
The complete code for the MobileViTv3 modules is as follows:

```python
import torch
import torch.nn as nn
from einops import rearrange


def conv_1x1_bn(inp, oup):
    # 1x1 pointwise convolution + BN + SiLU
    return nn.Sequential(
        nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
        nn.BatchNorm2d(oup),
        nn.SiLU()
    )


def conv_nxn_bn(inp, oup, kernel_size=3, stride=1):
    # n x n convolution + BN + SiLU
    return nn.Sequential(
        nn.Conv2d(inp, oup, kernel_size, stride, 1, bias=False),
        nn.BatchNorm2d(oup),
        nn.SiLU()
    )


class PreNorm(nn.Module):
    """Applies LayerNorm before the wrapped function (pre-norm Transformer style)."""
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)


class Attention(nn.Module):
    """Multi-head self-attention over the unfolded patch sequences."""
    def __init__(self, dim, heads=8, dim_head=64, dropout=0.):
        super().__init__()
        inner_dim = dim_head * heads
        project_out = not (heads == 1 and dim_head == dim)

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim=-1)
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x):
        qkv = self.to_qkv(x).chunk(3, dim=-1)
        q, k, v = map(lambda t: rearrange(t, 'b p n (h d) -> b p h n d', h=self.heads), qkv)

        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        attn = self.attend(dots)
        out = torch.matmul(attn, v)
        out = rearrange(out, 'b p h n d -> b p n (h d)')
        return self.to_out(out)


class FeedForward(nn.Module):
    """Position-wise MLP with SiLU activation."""
    def __init__(self, dim, hidden_dim, dropout=0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)


class MBTransformer(nn.Module):
    """Stack of pre-norm attention + feed-forward blocks with residual connections."""
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout=0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads, dim_head, dropout)),
                PreNorm(dim, FeedForward(dim, mlp_dim, dropout))
            ]))

    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
        return x


class MV2Block(nn.Module):
    """MobileNetV2 inverted residual block."""
    def __init__(self, inp, oup, stride=1, expansion=4):
        super().__init__()
        self.stride = stride
        assert stride in [1, 2]

        hidden_dim = int(inp * expansion)
        self.use_res_connect = self.stride == 1 and inp == oup

        if expansion == 1:
            self.conv = nn.Sequential(
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.SiLU(),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )
        else:
            self.conv = nn.Sequential(
                # pw
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.SiLU(),
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),
                nn.BatchNorm2d(hidden_dim),
                nn.SiLU(),
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),
                nn.BatchNorm2d(oup),
            )

    def forward(self, x):
        if self.use_res_connect:
            return x + self.conv(x)
        else:
            return self.conv(x)


class MobileViTv3_block(nn.Module):
    """MobileViTv3 block: local conv features, global Transformer features,
    1x1 fusion with the local branch, an input residual, and a trailing MV2 block."""
    def __init__(self, channel, dim, depth=2, kernel_size=3, patch_size=(2, 2), mlp_dim=int(64 * 2), dropout=0.):
        super().__init__()
        self.ph, self.pw = patch_size

        self.mv01 = MV2Block(channel, channel)
        self.conv1 = conv_nxn_bn(channel, channel, kernel_size)
        self.conv3 = conv_1x1_bn(dim, channel)
        self.conv2 = conv_1x1_bn(channel, dim)
        self.transformer = MBTransformer(dim, depth, 4, 8, mlp_dim, dropout)
        # fusion conv expects 2 * channel inputs, so this block assumes dim == channel
        self.conv4 = conv_nxn_bn(2 * channel, channel, kernel_size)

    def forward(self, x):
        y = x.clone()

        # local representations
        x = self.conv1(x)
        x = self.conv2(x)
        z = x.clone()

        # global representations: unfold into patches, run the Transformer, fold back
        _, _, h, w = x.shape
        x = rearrange(x, 'b d (h ph) (w pw) -> b (ph pw) (h w) d', ph=self.ph, pw=self.pw)
        x = self.transformer(x)
        x = rearrange(x, 'b (ph pw) (h w) d -> b d (h ph) (w pw)',
                      h=h // self.ph, w=w // self.pw, ph=self.ph, pw=self.pw)

        # fusion: v3 concatenates the local branch (z) rather than the raw input,
        # then adds the input residual y before the final MV2 block
        x = self.conv3(x)
        x = torch.cat((x, z), 1)
        x = self.conv4(x)
        x = x + y
        x = self.mv01(x)
        return x
```
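
As a quick sanity check, the block can be exercised standalone. The sketch below uses illustrative values (the 64-channel, 32x32 input is an assumption, not from the original post); note that the fusion layer `conv4` requires `dim == channel`, and the input height/width must be divisible by `patch_size`:

```python
# Minimal smoke test for MobileViTv3_block (illustrative values, assuming dim == channel).
import torch

block = MobileViTv3_block(channel=64, dim=64)
x = torch.randn(1, 64, 32, 32)  # (batch, channels, H, W); H and W divisible by patch_size (2, 2)
out = block(x)
print(out.shape)  # torch.Size([1, 64, 32, 32]) -- channels and spatial size are preserved
```

Because the block preserves both the channel count and the spatial resolution, it can be dropped into a YOLOv5 backbone in place of a same-shape convolutional stage without changing the downstream layer definitions.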