# MobileViTv2
import math

import torch
import torch.nn as nn
from einops import rearrange
def autopad(k, p=None):  # kernel, padding
    # Pad to 'same' output size when no padding is given
    if p is None:
        p = k // 2 if isinstance(k, int) else [x // 2 for x in k]  # auto-pad
    return p
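# Illustration only: for odd kernels autopad keeps the spatial size unchanged,
# e.g. autopad(3) == 1 and autopad((3, 5)) == [1, 2]; an explicit p is returned as-is.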
# Standard convolution layer
class Conv(nn.Module):
    # Standard convolution: Conv2d -> BatchNorm -> activation
    def __init__(self, c1, c2, k=1, s=1, p=None, g=1, act=True):  # ch_in, ch_out, kernel, stride, padding, groups
        super().__init__()
        self.conv = nn.Conv2d(c1, c2, k, s, autopad(k, p), groups=g, bias=False)
        self.bn = nn.BatchNorm2d(c2)
        self.act = nn.SiLU() if act is True else (act if isinstance(act, nn.Module) else nn.Identity())

    def forward(self, x):
        return self.act(self.bn(self.conv(x)))

    def forward_fuse(self, x):
        # used after Conv+BN fusion: the BatchNorm has been folded into the conv weights
        return self.act(self.conv(x))
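# Usage sketch (illustration, not part of the original file): a 3x3 stride-2 Conv halves the
# spatial resolution, e.g. Conv(3, 16, k=3, s=2) maps (b, 3, 640, 640) -> (b, 16, 320, 320).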
# Depth-wise convolution
class DWConv(Conv):
    # Depth-wise convolution class
    def __init__(self, c1, c2, k=1, s=1, act=True):  # ch_in, ch_out, kernel, stride
        # groups = gcd(c1, c2): becomes a true depth-wise convolution when c1 == c2
        super().__init__(c1, c2, k, s, g=math.gcd(c1, c2), act=act)
# 1x1 convolution branch with BN and SiLU
def conv_1x1_bn(inp, oup):
    return nn.Sequential(
        nn.Conv2d(inp, oup, 1, 1, 0, bias=False),
        nn.BatchNorm2d(oup),
        nn.SiLU()
    )
# --- ViT part --- #
# Wrapper that applies LayerNorm before the wrapped layer
class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        '''
        dim: input/output dimension
        fn: the wrapped feed-forward layer, either multi-head attention or the MLP
        '''
        super().__init__()
        # LayerNorm: ( a - mean(last dim) ) / sqrt( var(last dim) ), with learnable affine parameters
        # store the normalization over `dim` and the wrapped layer
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)
# FFN
class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout=0.):
        '''
        dim: input/output dimension
        hidden_dim: hidden-layer dimension
        dropout: dropout probability p
        '''
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.SiLU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)
# Attention
class Attention(nn.Module):
    def __init__(self, dim, heads=8, dim_head=64, dropout=0.):
        super().__init__()
        inner_dim = heads * dim_head
        project_out = not (heads == 1 and dim_head == dim)
        self.heads = heads
        # 1/sqrt(dim_head): rescales the dot products to unit variance so that softmax does not
        # saturate and zero out most entries; see "Scaled Dot-Product Attention" in
        # "Attention Is All You Need"
        self.scale = dim_head ** -0.5
        # dim=-1: softmax over the last dimension (the key axis of the attention matrix)
        self.attend = nn.Softmax(dim=-1)
        # one linear layer produces q, k and v together; they are split in forward()
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias=False)
        # for multi-head attention, project back to `dim` and apply dropout;
        # otherwise leave the output unchanged
        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()

    def forward(self, x):
        # x: (b, p, n, dim); split into the q, k and v matrices,
        # each of size inner_dim = heads * dim_head on the last axis
        qkv = self.to_qkv(x).chunk(3, dim=-1)
        # 'b p n (h d) -> b p h n d': separate the heads, giving one (n, dim_head) matrix per head;
        # q, k, v each end up with shape (b, p, heads, n, dim_head)
        q, k, v = map(lambda t: rearrange(t, 'b p n (h d) -> b p h n d', h=self.heads), qkv)
        # query * key^T gives the attention scores; the scaling keeps the dot products small
        # so softmax does not collapse most of them to zero
        # dots: (b, p, heads, n, n)
        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        # softmax over the last dimension turns the scores into attention weights
        attn = self.attend(dots)
        # weighted sum of the values
        # out: (b, p, heads, n, dim_head)
        out = torch.matmul(attn, v)
        # merge the heads back into a single channel dimension
        out = rearrange(out, 'b p h n d -> b p n (h d)')
        return self.to_out(out)
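# Shape sketch (illustrative values, assuming the unfolded-patch layout produced by
# MobileViTv2_Block below): with x of shape (b, p, n, dim) = (1, 4, 256, 64), heads=4 and
# dim_head=8, to_qkv gives (1, 4, 256, 96); q, k, v are each (1, 4, 4, 256, 8);
# dots and attn are (1, 4, 4, 256, 256); the merged out is (1, 4, 256, 32),
# which to_out projects back to (1, 4, 256, 64).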
# Transformer encoder module
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout=0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads, dim_head, dropout)),
                PreNorm(dim, FeedForward(dim, mlp_dim, dropout))
            ]))

    def forward(self, x):
        # each layer: pre-norm attention and pre-norm FFN, both with residual connections
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
        return x
# --- MobileViTv2 part --- #
# MV2 block (MobileNetV2 inverted residual)
class MV2Block(nn.Module):
    def __init__(self, inp, oup, stride=1, expansion=4):
        super().__init__()
        self.stride = stride
        assert stride in [1, 2]
        hidden_dim = int(inp * expansion)
        self.use_res_connect = self.stride == 1 and inp == oup
        if expansion == 1:  # expansion ratio
            self.conv = nn.Sequential(
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),  # 3x3 depth-wise conv
                nn.BatchNorm2d(hidden_dim),  # BN
                nn.SiLU(),  # SiLU activation
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),  # 1x1 point-wise conv
                nn.BatchNorm2d(oup),  # BN
            )
        else:
            self.conv = nn.Sequential(
                # pw
                nn.Conv2d(inp, hidden_dim, 1, 1, 0, bias=False),  # 1x1 point-wise conv
                nn.BatchNorm2d(hidden_dim),  # BN
                nn.SiLU(),  # SiLU activation
                # dw
                nn.Conv2d(hidden_dim, hidden_dim, 3, stride, 1, groups=hidden_dim, bias=False),  # 3x3 depth-wise conv
                nn.BatchNorm2d(hidden_dim),  # BN
                nn.SiLU(),  # SiLU activation
                # pw-linear
                nn.Conv2d(hidden_dim, oup, 1, 1, 0, bias=False),  # 1x1 point-wise conv
                nn.BatchNorm2d(oup),  # BN
            )

    def forward(self, x):
        if self.use_res_connect:
            # identity shortcut when stride == 1 and input/output channels match
            return x + self.conv(x)
        else:
            return self.conv(x)
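# Example (illustration only): MV2Block(64, 64) keeps the identity shortcut
# (stride 1, matching channels), while MV2Block(64, 128, stride=2) does not.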
# MobileViTv2_Block (core module)
class MobileViTv2_Block(nn.Module):
    def __init__(self, sim_channel, dim=64, depth=2, kernel_size=3, patch_size=(2, 2), mlp_dim=int(64 * 2), dropout=0.):
        super().__init__()
        self.ph, self.pw = patch_size  # patch height and width
        self.conv1 = DWConv(sim_channel, sim_channel, kernel_size)  # 3x3 depth-wise separable conv
        self.conv2 = conv_1x1_bn(sim_channel, dim)  # 1x1 conv
        self.transformer = Transformer(dim, depth, 4, 8, mlp_dim, dropout)  # Transformer encoder
        self.conv3 = conv_1x1_bn(dim, sim_channel)  # 1x1 conv
        self.mv2 = MV2Block(sim_channel, sim_channel)  # MV2 block

    def forward(self, x):
        # Local representations
        x = self.conv1(x)
        x = self.conv2(x)
        # Global representations
        _, _, h, w = x.shape
        # unfold the feature map into patches: (b, d, H, W) -> (b, ph*pw, H/ph * W/pw, d)
        x = rearrange(x, 'b d (h ph) (w pw) -> b (ph pw) (h w) d', ph=self.ph, pw=self.pw)
        x = self.transformer(x)
        # fold the patches back into a feature map
        x = rearrange(x, 'b (ph pw) (h w) d -> b d (h ph) (w pw)', h=h // self.ph, w=w // self.pw,
                      ph=self.ph, pw=self.pw)
        x = self.conv3(x)
        x = self.mv2(x)
        return x
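

# Minimal smoke test (a sketch, not part of the original module): feed a dummy feature map
# through MobileViTv2_Block and check that the spatial size and channel count are preserved.
# H and W must be divisible by the patch size (2, 2).
if __name__ == '__main__':
    x = torch.randn(1, 64, 32, 32)            # dummy (b, c, h, w) feature map
    block = MobileViTv2_Block(sim_channel=64)
    y = block(x)
    print(y.shape)                            # expected: torch.Size([1, 64, 32, 32])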