ViT (Vision Transformer): model structure and source-code walkthrough
ViT overview
ViT was the first successful attempt to apply the Transformer to image classification.
However, it has to be pre-trained on very large amounts of data, and beyond the training difficulty, existing Vision Transformers are large in both parameter count and compute: ViT needs roughly 18B FLOPs to reach about 78% top-1 accuracy on ImageNet, whereas a CNN such as GhostNet can exceed 79% top-1 with only about 600M FLOPs.
ViT network structure
The pipeline implemented below: split the image into fixed-size patches, linearly embed each patch, prepend a learnable cls token and add positional embeddings, run the token sequence through a stack of Transformer encoder blocks (pre-norm multi-head self-attention plus a two-layer MLP, each with a residual connection), then classify from the cls token (or the mean of all tokens) with an MLP head.
Source-code walkthrough
import torch
from torch import nn
from einops import rearrange, repeat
from einops.layers.torch import Rearrange
# helpers

def pair(t):
    return t if isinstance(t, tuple) else (t, t)
# classes
# PreNorm applies layer normalization before the wrapped sub-layer fn
class PreNorm(nn.Module):
    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        return self.fn(self.norm(x), **kwargs)
# FeedForward is just two linear transformations (with GELU and dropout in between)
class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden_dim, dim),
            nn.Dropout(dropout)
        )

    def forward(self, x):
        return self.net(x)
# Attention keeps the input and output shapes the same, [1, num_patches+1, 128] -> [1, num_patches+1, 128]; its purpose is to give each patch a different weight, i.e. a different amount of attention
class Attention(nn.Module):
    def __init__(self, dim, heads = 8, dim_head = 64, dropout = 0.):
        super().__init__()
        inner_dim = dim_head * heads
        project_out = not (heads == 1 and dim_head == dim)

        self.heads = heads
        self.scale = dim_head ** -0.5

        self.attend = nn.Softmax(dim = -1)
        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)

        self.to_out = nn.Sequential(
            nn.Linear(inner_dim, dim),
            nn.Dropout(dropout)
        ) if project_out else nn.Identity()
    def forward(self, x):
        # Project x to three tensors q, k, v of the same shape; the attention weights are
        # softmax(q @ k^T * scale), which are then applied to v.
        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)

        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale
        attn = self.attend(dots)

        out = torch.matmul(attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)
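As a quick sanity check of the shape comment above (the numbers are just an example: dim = 128, 8 heads, 64 patches plus the cls token, batch size 1), Attention maps [1, 65, 128] back to [1, 65, 128]:

# Hypothetical sanity check, not part of the original listing; dim, heads, dim_head are example values.
attention = Attention(dim = 128, heads = 8, dim_head = 64)
tokens = torch.randn(1, 65, 128)        # [batch, num_patches + 1, dim]
print(attention(tokens).shape)          # torch.Size([1, 65, 128]) -- same shape as the input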
# The Transformer weights the dimension-reduced patches with different coefficients (the attention mechanism) and then applies the two-layer linear transformation
class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout = 0.):
        super().__init__()
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                PreNorm(dim, Attention(dim, heads = heads, dim_head = dim_head, dropout = dropout)),
                PreNorm(dim, FeedForward(dim, mlp_dim, dropout = dropout))
            ]))
    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
        return x
class ViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_class, dim, depth, heads, mlp_dim, pool = 'cls', channels = 3, dim_head = 64, dropout = 0., emb_dropout = 0.):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'
        # Patch embedding: each patch_height x patch_width patch is flattened and projected to the hidden dimension dim.
        self.to_patch_embedding = nn.Sequential(
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1 = patch_height, p2 = patch_width),
            nn.Linear(patch_dim, dim),
        )
        # Each token (including the cls token) gets a positional encoding of length dim
        # (128 in the running example), added element-wise to the input.
        # On positional encodings:
        #   1. fixed sinusoidal encoding - no learned parameters
        #   2. absolute positional encoding - learned 1D embedding
        #   3. axial positional encoding - learned 2D embedding
        # Most NLP models (and GPT) just use option 2. Yes, the new vision SOTA shares GPT's
        # architecture with only minor differences; all roads lead to Rome.
        # (A sketch of option 1 appears after the full listing for comparison.)
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        # About the cls_token: in NLP it produces the whole-sentence classification; by several
        # accounts, dropping it and classifying from the mean of all tokens works just as well.
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        # Encoder stack applied in forward().
        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.pool = pool
        self.to_latent = nn.Identity()
        # Classification head: map to the final number of classes.
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_class)
        )
    def forward(self, img):
        # Split the image into patches and project each one to the embedding dimension.
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        # Prepend the cls token ([1, 1, 128] in the running example) to the patch tokens.
        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b = b)
        x = torch.cat((cls_tokens, x), dim = 1)
        x += self.pos_embedding[:, :(n + 1)]  # add the positional embeddings
        x = self.dropout(x)

        # Attention followed by the two-layer MLP, repeated depth times.
        x = self.transformer(x)

        # Average over the num_patches+1 dimension, [1, num_patches+1, 128] -> [1, 128],
        # or take the cls token, depending on pool.
        x = x.mean(dim = 1) if self.pool == 'mean' else x[:, 0]

        x = self.to_latent(x)
        # One final linear layer maps the output to the number of classes: 128 -> num_class.
        return self.mlp_head(x)
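To tie the pieces together, here is a minimal usage sketch. The hyperparameters (image_size = 256, patch_size = 32, dim = 128, num_class = 10, and so on) are example values chosen to match the 128-dimensional running example in the comments, not values prescribed by the text:

# Example: 256x256 RGB images, 32x32 patches -> 8*8 = 64 patches per image.
model = ViT(
    image_size = 256,
    patch_size = 32,
    num_class = 10,
    dim = 128,
    depth = 6,
    heads = 8,
    mlp_dim = 256,
    dropout = 0.1,
    emb_dropout = 0.1
)

img = torch.randn(1, 3, 256, 256)
patches = model.to_patch_embedding(img)
print(patches.shape)        # torch.Size([1, 64, 128]) -- 64 patch tokens of width dim
logits = model(img)
print(logits.shape)         # torch.Size([1, 10])      -- one score per class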
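The listing above uses option 2 from the positional-encoding comment (a learned absolute embedding, self.pos_embedding). For comparison, a minimal sketch of option 1, the fixed sinusoidal encoding with no learned parameters, might look like the helper below; sinusoidal_pos_embedding is an illustrative function written here, not part of the ViT code above:

def sinusoidal_pos_embedding(num_tokens, dim):
    # Fixed sine/cosine positional encoding (option 1): even channels get sin, odd channels
    # get cos, with frequencies decreasing geometrically across channels. dim is assumed even.
    position = torch.arange(num_tokens, dtype = torch.float).unsqueeze(1)   # [num_tokens, 1]
    div_term = torch.exp(torch.arange(0, dim, 2, dtype = torch.float) * (-torch.log(torch.tensor(10000.0)) / dim))
    pe = torch.zeros(num_tokens, dim)
    pe[:, 0::2] = torch.sin(position * div_term)
    pe[:, 1::2] = torch.cos(position * div_term)
    return pe.unsqueeze(0)                                                  # [1, num_tokens, dim]

# It could stand in for the learned table, e.g. for 64 patches + 1 cls token with dim = 128:
# x += sinusoidal_pos_embedding(65, 128)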