【PyTorch torchvision 源码解读系列—3】Inception V3
PyTorch框架中有一个非常重要且好用的包:torchvision,顾名思义这个包主要是关于计算机视觉(CV)的。这个包主要由3个子包组成,分别是:torchvision.datasets、torchvision.models、torchvision.transforms。
承接上一篇,今天来看看Inception V3的PyTorch实现。
首先因为有很多卷积的操作是重复的,所以定义了一个BasicConv2d的类:
class BasicConv2d(nn.Module):
    """Convolution followed by batch normalisation and ReLU.

    This is the basic building unit reused by every Inception module.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by the convolution.
        **kwargs: Forwarded unchanged to ``nn.Conv2d`` (for example
            ``kernel_size``, ``stride``, ``padding``).
    """

    def __init__(self, in_channels, out_channels, **kwargs):
        super(BasicConv2d, self).__init__()
        # No conv bias: the batch-norm layer that follows has its own shift.
        self.conv = nn.Conv2d(in_channels, out_channels, bias=False, **kwargs)
        self.bn = nn.BatchNorm2d(out_channels, eps=0.001)

    def forward(self, x):
        """Apply conv -> batch norm -> ReLU and return the activated tensor."""
        x = self.conv(x)
        x = self.bn(x)
        return F.relu(x, inplace=True)
这个类实现了最基本的卷积加上BN(再接ReLU激活)的操作。in_channels和out_channels是我们可以自己定义的,而**kwargs的意思是能接收任意多个关键字参数,这也意味着我们可以定义卷积的stride大小、padding的大小等等。我们将会在下面的inception模块中不断复用这个类。
然后inception系列的网络架构最重点的当然是module的构建,这里实现了InceptionA~E五种不同结构的inception module,但是我发现和原论文里面的并不完全一样,可能是实现的时候修改了吧。不管怎么样,module的样子大概就是下图这样:
来看看这个InceptionA。这里的结构大致是一个module里面有四个分支,__init__里面就是结构的定义。第一个分支是branch1x1,只有一个1*1的卷积;第二个分支是一个1*1的卷积接一个5*5的卷积;第三个分支是一个1*1的卷积接两个3*3的卷积;而第四个分支是一个简单的average pooling再接一个1*1的卷积。你可能会有疑问为什么不同卷积核的输出大小是一样大的,因为这里针对每个分支使用了不同的padding(零填充),而且每个分支的stride都为1,最后就会输出空间大小相同的卷积结果。
值得我们注意的是最后outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]把各个分支的输出放进一个列表,再由torch.cat(outputs, 1)在第1维(通道维)上做concatenation,把不同的分支结合在一起。
class InceptionA(nn.Module):
    """Inception module with four parallel branches.

    Branches (all stride 1, padded so the spatial size is preserved):
      * 1x1 convolution                      -> 64 channels
      * 1x1 then 5x5 convolution             -> 64 channels
      * 1x1 then two 3x3 convolutions        -> 96 channels
      * 3x3 average pool then 1x1 convolution -> ``pool_features`` channels

    The branch outputs are concatenated along dimension 1, so the module
    emits ``224 + pool_features`` channels.

    Args:
        in_channels: Number of channels of the input feature map.
        pool_features: Output channels of the pooling branch.
    """

    def __init__(self, in_channels, pool_features):
        super(InceptionA, self).__init__()
        self.branch1x1 = BasicConv2d(in_channels, 64, kernel_size=1)

        self.branch5x5_1 = BasicConv2d(in_channels, 48, kernel_size=1)
        self.branch5x5_2 = BasicConv2d(48, 64, kernel_size=5, padding=2)

        self.branch3x3dbl_1 = BasicConv2d(in_channels, 64, kernel_size=1)
        self.branch3x3dbl_2 = BasicConv2d(64, 96, kernel_size=3, padding=1)
        self.branch3x3dbl_3 = BasicConv2d(96, 96, kernel_size=3, padding=1)

        self.branch_pool = BasicConv2d(in_channels, pool_features, kernel_size=1)

    def forward(self, x):
        """Run all four branches on ``x`` and concatenate them on dim 1."""
        branch1x1 = self.branch1x1(x)

        branch5x5 = self.branch5x5_1(x)
        branch5x5 = self.branch5x5_2(branch5x5)

        branch3x3dbl = self.branch3x3dbl_1(x)
        branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
        branch3x3dbl = self.branch3x3dbl_3(branch3x3dbl)

        branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
        branch_pool = self.branch_pool(branch_pool)

        outputs = [branch1x1, branch5x5, branch3x3dbl, branch_pool]
        return torch.cat(outputs, 1)
同理其他的module也是大同小异,这里就不多说了。我们来看一下特别的network in network in network结构,这里的意思是有一个特殊的module,它里面有两重分支。
在这里这个module叫InceptionE。下面完整的代码可以看到在第二个分支的branch3x3_1后面有两个层branch3x3_2a和
branch3x3_2b,它们就是在第一层传递之后第二层分叉了,最后又在终点结合在一起。怎么做到的呢?
# Fork: feed the same tensor through both second-level layers and keep
# the two results side by side in a list.
branch3x3 = [
    self.branch3x3_2a(branch3x3),
    self.branch3x3_2b(branch3x3),
]
这里就是先用torch.cat将两个结果合并在一起,最后再和其他分支做一次最终的合并:
# Join the two forked results first, so that ``outputs`` holds a tensor
# (not a list) for this branch.
branch3x3 = torch.cat(branch3x3, 1)
outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
class InceptionE(nn.Module):
    """Inception module whose 3x3 branches fork a second time.

    Branches (all stride 1, padded so the spatial size is preserved):
      * 1x1 convolution                                   -> 320 channels
      * 1x1, then parallel 1x3 and 3x1 convolutions        -> 384 + 384
      * 1x1, 3x3, then parallel 1x3 and 3x1 convolutions   -> 384 + 384
      * 3x3 average pool then 1x1 convolution              -> 192 channels

    Each forked pair is concatenated on dimension 1 before the four branch
    outputs are concatenated, giving 2048 output channels in total.

    Args:
        in_channels: Number of channels of the input feature map.
    """

    def __init__(self, in_channels):
        super(InceptionE, self).__init__()
        self.branch1x1 = BasicConv2d(in_channels, 320, kernel_size=1)

        self.branch3x3_1 = BasicConv2d(in_channels, 384, kernel_size=1)
        self.branch3x3_2a = BasicConv2d(384, 384, kernel_size=(1, 3), padding=(0, 1))
        self.branch3x3_2b = BasicConv2d(384, 384, kernel_size=(3, 1), padding=(1, 0))

        self.branch3x3dbl_1 = BasicConv2d(in_channels, 448, kernel_size=1)
        self.branch3x3dbl_2 = BasicConv2d(448, 384, kernel_size=3, padding=1)
        self.branch3x3dbl_3a = BasicConv2d(384, 384, kernel_size=(1, 3), padding=(0, 1))
        self.branch3x3dbl_3b = BasicConv2d(384, 384, kernel_size=(3, 1), padding=(1, 0))

        self.branch_pool = BasicConv2d(in_channels, 192, kernel_size=1)

    def forward(self, x):
        """Run the four branches on ``x`` and concatenate them on dim 1."""
        branch1x1 = self.branch1x1(x)

        # First fork: one shared 1x1 conv, then two parallel asymmetric convs.
        branch3x3 = self.branch3x3_1(x)
        branch3x3 = [
            self.branch3x3_2a(branch3x3),
            self.branch3x3_2b(branch3x3),
        ]
        branch3x3 = torch.cat(branch3x3, 1)

        # Second fork: 1x1 -> 3x3, then the same parallel asymmetric pair.
        branch3x3dbl = self.branch3x3dbl_1(x)
        branch3x3dbl = self.branch3x3dbl_2(branch3x3dbl)
        branch3x3dbl = [
            self.branch3x3dbl_3a(branch3x3dbl),
            self.branch3x3dbl_3b(branch3x3dbl),
        ]
        branch3x3dbl = torch.cat(branch3x3dbl, 1)

        branch_pool = F.avg_pool2d(x, kernel_size=3, stride=1, padding=1)
        branch_pool = self.branch_pool(branch_pool)

        outputs = [branch1x1, branch3x3, branch3x3dbl, branch_pool]
        return torch.cat(outputs, 1)
此外还有一个比较特殊的结构是辅助分类结构,也就是从完整网络中间某层引出一个分支做分类,其结果以一定的比例添加到最终的分类结果中。它跟网络最后的分类是类似的,只是它是在中间分支出来的辅助结果。结构是卷积接一层线性分类,没有之前VGG、AlexNet那样的多层全连接,参数大大减少。
class InceptionAux(nn.Module):
    """Auxiliary classifier head attached to an intermediate feature map.

    Pipeline: 5x5 average pool (stride 3) -> 1x1 conv to 128 channels ->
    5x5 conv to 768 channels -> flatten -> linear layer.

    Args:
        in_channels: Number of channels of the input feature map.
        num_class: Number of output classes of the linear layer.
    """

    def __init__(self, in_channels, num_class):
        super(InceptionAux, self).__init__()
        self.conv0 = BasicConv2d(in_channels, 128, kernel_size=1)
        self.conv1 = BasicConv2d(128, 768, kernel_size=5)
        # Initialisation hint, kept as in upstream torchvision.
        self.conv1.stddev = 0.01
        self.fc = nn.Linear(768, num_class)
        # Initialisation hint; Inception3's weight init looks for ``stddev``.
        self.fc.stddev = 0.001

    def forward(self, x):
        """Return the auxiliary logits for feature map ``x``."""
        # 17 x 17 x 768
        x = F.avg_pool2d(x, kernel_size=5, stride=3)
        # 5 x 5 x 768
        x = self.conv0(x)
        # 5 x 5 x 128
        x = self.conv1(x)
        # 1 x 1 x 768
        x = x.view(x.size(0), -1)
        # 768
        x = self.fc(x)
        # 1000
        return x
最后来看看Inception V3的完整结构吧。__init__函数里定义网络的结构,有哪些基本模块,并且对权重初始化。forward函数定义了输入数据的流动方向,基本上就是前面只有卷积层,后面开始使用不同的inception module,最后一层linear线性输出结果。而如果使用
aux_logits就会添加辅助分类结构,训练时最后返回的结果也会包括辅助分类的结果。
class Inception3(nn.Module):
    """Inception v3 classification network.

    Args:
        num_class: Size of the final classification layer.
        aux_logits: If true, build the auxiliary classifier; in training
            mode ``forward`` then returns ``(logits, aux_logits)``.
        transform_input: If true, ``forward`` re-scales every input channel
            with fixed constants before the first convolution.
    """

    def __init__(self, num_class=1000, aux_logits=True, transform_input=False):
        super(Inception3, self).__init__()
        self.aux_logits = aux_logits
        self.transform_input = transform_input

        # Stem: plain convolutions.
        self.Conv2d_1a_3x3 = BasicConv2d(3, 32, kernel_size=3, stride=2)
        self.Conv2d_2a_3x3 = BasicConv2d(32, 32, kernel_size=3)
        self.Conv2d_2b_3x3 = BasicConv2d(32, 64, kernel_size=3, padding=1)
        self.Conv2d_3b_1x1 = BasicConv2d(64, 80, kernel_size=1)
        self.Conv2d_4a_3x3 = BasicConv2d(80, 192, kernel_size=3)

        # Inception modules.
        self.Mixed_5b = InceptionA(192, pool_features=32)
        self.Mixed_5c = InceptionA(256, pool_features=64)
        self.Mixed_5d = InceptionA(288, pool_features=64)
        self.Mixed_6a = InceptionB(288)
        self.Mixed_6b = InceptionC(768, channels_7x7=128)
        self.Mixed_6c = InceptionC(768, channels_7x7=160)
        self.Mixed_6d = InceptionC(768, channels_7x7=160)
        self.Mixed_6e = InceptionC(768, channels_7x7=192)
        if aux_logits:
            self.AuxLogits = InceptionAux(768, num_class)
        self.Mixed_7a = InceptionD(768)
        self.Mixed_7b = InceptionE(1280)
        self.Mixed_7c = InceptionE(2048)
        self.fc = nn.Linear(2048, num_class)

        # Imported here so scipy is only required when a model is built.
        import scipy.stats as stats

        # Weight init: truncated normal on [-2, 2] scaled by ``stddev`` for
        # conv/linear layers (a per-module ``stddev`` attribute overrides the
        # 0.1 default); batch norm starts as weight 1, bias 0.
        for m in self.modules():
            if isinstance(m, (nn.Conv2d, nn.Linear)):
                stddev = m.stddev if hasattr(m, 'stddev') else 0.1
                X = stats.truncnorm(-2, 2, scale=stddev)
                values = torch.as_tensor(X.rvs(m.weight.numel()), dtype=m.weight.dtype)
                values = values.view(m.weight.size())
                with torch.no_grad():
                    m.weight.copy_(values)
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x):
        """Compute class logits for a batch of images.

        Returns ``(logits, aux_logits)`` when the module is in training mode
        and was built with ``aux_logits=True``; otherwise only ``logits``.
        """
        if self.transform_input:
            # NOTE(review): the constants look like a conversion from
            # ImageNet mean/std normalisation to (x - 0.5) / 0.5 - confirm.
            x_ch0 = torch.unsqueeze(x[:, 0], 1) * (0.229 / 0.5) + (0.485 - 0.5) / 0.5
            x_ch1 = torch.unsqueeze(x[:, 1], 1) * (0.224 / 0.5) + (0.456 - 0.5) / 0.5
            x_ch2 = torch.unsqueeze(x[:, 2], 1) * (0.225 / 0.5) + (0.406 - 0.5) / 0.5
            x = torch.cat((x_ch0, x_ch1, x_ch2), 1)
        # 299 x 299 x 3
        x = self.Conv2d_1a_3x3(x)
        # 149 x 149 x 32
        x = self.Conv2d_2a_3x3(x)
        # 147 x 147 x 32
        x = self.Conv2d_2b_3x3(x)
        # 147 x 147 x 64
        x = F.max_pool2d(x, kernel_size=3, stride=2)
        # 73 x 73 x 64
        x = self.Conv2d_3b_1x1(x)
        # 73 x 73 x 80
        x = self.Conv2d_4a_3x3(x)
        # 71 x 71 x 192
        x = F.max_pool2d(x, kernel_size=3, stride=2)
        # 35 x 35 x 192
        x = self.Mixed_5b(x)
        # 35 x 35 x 256
        x = self.Mixed_5c(x)
        # 35 x 35 x 288
        x = self.Mixed_5d(x)
        # 35 x 35 x 288
        x = self.Mixed_6a(x)
        # 17 x 17 x 768
        x = self.Mixed_6b(x)
        # 17 x 17 x 768
        x = self.Mixed_6c(x)
        # 17 x 17 x 768
        x = self.Mixed_6d(x)
        # 17 x 17 x 768
        x = self.Mixed_6e(x)
        # 17 x 17 x 768
        if self.training and self.aux_logits:
            # The auxiliary head branches off here, in training mode only.
            aux = self.AuxLogits(x)
        # 17 x 17 x 768
        x = self.Mixed_7a(x)
        # 8 x 8 x 1280
        x = self.Mixed_7b(x)
        # 8 x 8 x 2048
        x = self.Mixed_7c(x)
        # 8 x 8 x 2048
        x = F.avg_pool2d(x, kernel_size=8)
        # 1 x 1 x 2048
        x = F.dropout(x, training=self.training)
        # 1 x 1 x 2048
        x = x.view(x.size(0), -1)
        # 2048
        x = self.fc(x)
        # 1000 (num_class)
        if self.training and self.aux_logits:
            return x, aux
        return x
最后贴上完整的代码: