ViT (Vision Transformer): model structure and source-code walkthrough
ViT overview
ViT was the first successful attempt to apply the Transformer to image classification.
However, it has to be pre-trained on very large amounts of data, and beyond the training difficulty, existing Vision Transformers are large in both parameter count and compute: ViT needs roughly 18B FLOPs to reach about 78% top-1 accuracy on ImageNet, whereas a CNN such as GhostNet can exceed 79% top-1 with only about 600M FLOPs.
ViT network structure
The pipeline implemented below: split the image into fixed-size patches, linearly embed each patch, prepend a learnable cls token and add positional embeddings, run the token sequence through a stack of Transformer encoder blocks (pre-norm multi-head self-attention plus a two-layer MLP, each with a residual connection), then classify from the cls token (or the mean of all tokens) with an MLP head.
Source-code walkthrough
import torch
from torch import nn
from einops import rearrange, repeat
from einops.layers.torch import Rearrange
# helpers

def pair(t):
    return t if isinstance(t, tuple) else (t, t)
# classes
# PreNorm applies layer normalization before the wrapped sub-layer fn
class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)
# FeedForward is just two linear transformations (with GELU and dropout in between)
class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)
# Attention keeps the input and output shapes the same, [1, num_patches+1, 128] -> [1, num_patches+1, 128]; its purpose is to give each patch a different weight, i.e. a different amount of attention
class Attention(nn.Module):
    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        super().__init__()
        inner_dim = dim_head * heads
        project_out = not (heads == 1 and dim_head == dim)

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim = -1)
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()
    def forward(self, x):
        # Project x to three tensors q, k, v of the same shape; the attention weights are
        # softmax(q @ k^T * scale), which are then applied to v.
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)

        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        attn = self.attend(dots)

        out = torch.matmul(attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)
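As a quick sanity check of the shape comment above (the numbers are just an example: dim = 128, 8 heads, 64 patches plus the cls token, batch size 1), Attention maps [1, 65, 128] back to [1, 65, 128]:

# Hypothetical sanity check, not part of the original listing; dim, heads, dim_head are example values.
attention = Attention(dim = 128, heads = 8, dim_head = 64)
tokens = torch.randn(1, 65, 128)        # [batch, num_patches + 1, dim]
print(attention(tokens).shape)          # torch.Size([1, 65, 128]) -- same shape as the input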
# The Transformer weights the dimension-reduced patches with different coefficients (the attention mechanism) and then applies the two-layer linear transformation
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)),
                PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
            ]))
    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
        return x
class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_class, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'
        # Patch embedding: each patch_height x patch_width patch is flattened and projected to the hidden dimension dim.
        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
            nn.Linear(patch_dim, dim),
        )
        # Each token (including the cls token) gets a positional encoding of length dim
        # (128 in the running example), added element-wise to the input.
        # On positional encodings:
        #   1. fixed sinusoidal encoding - no learned parameters
        #   2. absolute positional encoding - learned 1D embedding
        #   3. axial positional encoding - learned 2D embedding
        # Most NLP models (and GPT) just use option 2. Yes, the new vision SOTA shares GPT's
        # architecture with only minor differences; all roads lead to Rome.
        # (A sketch of option 1 appears after the full listing for comparison.)
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        # About the cls_token: in NLP it produces the whole-sentence classification; by several
        # accounts, dropping it and classifying from the mean of all tokens works just as well.
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        # Encoder stack applied in forward().
        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()
        # Classification head: map to the final number of classes.
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_class)
        )
    def forward(self, img):
        # Split the image into patches and project each one to the embedding dimension.
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        # Prepend the cls token ([1, 1, 128] in the running example) to the patch tokens.
        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b)
        x = torch.cat((cls_tokens, x), dim = 1)
        x += self.pos_embedding[:, :(n + 1)]  # add the positional embeddings
        x = self.dropout(x)

        # Attention followed by the two-layer MLP, repeated depth times.
        x = self.transformer(x)

        # Average over the num_patches+1 dimension, [1, num_patches+1, 128] -> [1, 128],
        # or take the cls token, depending on pool.
        x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]

        x = self.to_latent(x)
        # One final linear layer maps the output to the number of classes: 128 -> num_class.
        return self.mlp_head(x)
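To tie the pieces together, here is a minimal usage sketch. The hyperparameters (image_size = 256, patch_size = 32, dim = 128, num_class = 10, and so on) are example values chosen to match the 128-dimensional running example in the comments, not values prescribed by the text:

# Example: 256x256 RGB images, 32x32 patches -> 8*8 = 64 patches per image.
model = ViT(
    image_size = 256,
    patch_size = 32,
    num_class = 10,
    dim = 128,
    depth = 6,
    heads = 8,
    mlp_dim = 256,
    dropout = 0.1,
    emb_dropout = 0.1
)

img = torch.randn(1, 3, 256, 256)
patches = model.to_patch_embedding(img)
print(patches.shape)        # torch.Size([1, 64, 128]) -- 64 patch tokens of width dim
logits = model(img)
print(logits.shape)         # torch.Size([1, 10])      -- one score per class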
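The listing above uses option 2 from the positional-encoding comment (a learned absolute embedding, self.pos_embedding). For comparison, a minimal sketch of option 1, the fixed sinusoidal encoding with no learned parameters, might look like the helper below; sinusoidal_pos_embedding is an illustrative function written here, not part of the ViT code above:

def sinusoidal_pos_embedding(num_tokens, dim):
    # Fixed sine/cosine positional encoding (option 1): even channels get sin, odd channels
    # get cos, with frequencies decreasing geometrically across channels. dim is assumed even.
    position = torch.arange(num_tokens, dtype = torch.float).unsqueeze(1)   # [num_tokens, 1]
    div_term = torch.exp(torch.arange(0, dim, 2, dtype = torch.float) * (-torch.log(torch.tensor(10000.0)) / dim))
    pe = torch.zeros(num_tokens, dim)
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe.unsqueeze(0)                                                  # [1, num_tokens, dim]

# It could stand in for the learned table, e.g. for 64 patches + 1 cls token with dim = 128:
# x += sinusoidal_pos_embedding(65, 128)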