神经网络-YoloV3复现（网络搭建篇）

一、前言

系统：win10
环境：python3.7
框架：pytorch1.1

完整代码会在最后给出

二、Darknet网络模版

2.1 模板说明

网络模板如下图所示。YoloV3只借鉴了前面的卷积部分（有修改），后面的Avgpool、Connected、Softmax层被去掉。注意事项：

每一个Convolutional内都包含Conv2d、BN、LeakyReLU三个部分（这是固定搭配），为了方便使用，建议单独实现
每一个Residual都包含两个Convolutional

2.2 部分代码实现

Convolutional单独实现

class Conv2d(nn.Module):
    """Convolution unit: ``nn.Conv2d`` -> ``nn.BatchNorm2d`` -> ``nn.LeakyReLU(0.1)``.

    Every constructor argument is forwarded to ``nn.Conv2d``; ``out_channels``
    additionally sizes the batch-norm layer.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, bias):
        super().__init__()
        convolution = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            bias=bias,
        )
        normalization = nn.BatchNorm2d(out_channels)
        activation = nn.LeakyReLU(0.1)
        # Held in a single Sequential so the sub-modules stay indexed 0, 1, 2.
        self.conv = nn.Sequential(convolution, normalization, activation)

    def forward(self, x):
        """Run ``x`` through convolution, batch norm and leaky ReLU, in that order."""
        y = self.conv(x)
        return y

Residual单独实现

class BasicBlock(nn.Module):
    """Residual unit: a 1x1 ``Conv2d``, then a 3x3 ``Conv2d``, plus the input.

    ``planes`` is indexed as ``planes[0]`` (channels after the 1x1 layer) and
    ``planes[1]`` (channels after the 3x3 layer). The result of the 3x3 layer
    is added to the input, so the two must be shape-compatible.
    """

    def __init__(self, inplanes, planes):
        super().__init__()
        mid_channels = planes[0]
        out_channels = planes[1]

        # 1x1 layer changes the channel count; 3x3 layer (padding 1) keeps
        # the spatial size.
        self.Conv1 = Conv2d(inplanes, mid_channels, kernel_size=1, stride=1, padding=0, bias=False)
        self.Conv2 = Conv2d(mid_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)

    def forward(self, x):
        """Return ``Conv2(Conv1(x)) + x``."""
        branch = self.Conv2(self.Conv1(x))
        branch += x  # skip connection
        return branch

三、YoloV3网络结构

（找了好久才找齐这两幅图片——darknet和YoloV3改进，单独看一个都不好看懂）

3.1 网络说明

网络如下所示¹，前面的一部分借鉴了darknet53结构，后面从Convolutional Set开始由YoloV3作者自己改进，有三个输出，添加了上采样。具体原理可以参考论文或者其他优秀博客

yolo系列之yolo v3【深度解析】里面有张神图，转载需要版权，故需要点进去看

3.2 部分代码实现

3.2.1 Convolutional Set

class ConvSet(nn.Module):  # inplanes -> outplanes
    """Five stacked ``Conv2d`` units alternating 1x1 and 3x3 kernels.

    Channel flow:
    inplanes -> outplanes -> outplanes -> 2*outplanes -> 2*outplanes -> outplanes.
    All layers use stride 1 and no convolution bias; the 3x3 layers use
    padding 1 and the 1x1 layers use padding 0.
    """

    def __init__(self, inplanes, outplanes):
        super().__init__()
        wide = outplanes * 2
        # One row per layer: (in_channels, out_channels, kernel_size, padding).
        layer_specs = (
            (inplanes, outplanes, 1, 0),
            (outplanes, outplanes, 3, 1),
            (outplanes, wide, 1, 0),
            (wide, wide, 3, 1),
            (wide, outplanes, 1, 0),
        )
        stack = [Conv2d(c_in, c_out, k, 1, pad, False) for c_in, c_out, k, pad in layer_specs]
        self.convset = nn.Sequential(*stack)

    def forward(self, x):
        """Pass ``x`` through the five layers in order."""
        y = self.convset(x)
        return y

看到上述的代码实现，心中涌现几个问题：

图中信息只给出了卷积核的大小，并不知道填充位（padding）
图中也没有给出中间层输入通道和输出通道的数量，这些该怎么确定

在没有完整的看完论文之前有以下猜测

第一个卷积一般按照给定的输入输出（inplanes, outplanes），卷积核为1
如果卷积核为3，则输入和输出通道数一样，此时填充（padding）为1。（这应该是检测功能实现部位）
后面再遇到卷积核为1，则需要进行通道变换（放大，或者缩小）。（这应该是选择功能实现部位）

看完论文后，我想打人！！！没有说网络是怎么建立的！

3.2.2 UpSampling

class Upsampling(nn.Module):
    """Parameter-free up-sampling layer wrapping ``nn.functional.interpolate``.

    Args:
        scale_factor: Multiplier applied to the spatial size. Defaults to 2.
        mode: Interpolation mode handed to ``interpolate``. Defaults to
            ``'nearest'``.

    Calling ``Upsampling()`` with no arguments keeps the original fixed
    behaviour (2x nearest-neighbour).
    """

    def __init__(self, scale_factor=2, mode='nearest'):
        super(Upsampling, self).__init__()
        self.scale_factor = scale_factor
        self.mode = mode

    def forward(self, x):
        """Return ``x`` resized by ``scale_factor`` using ``mode``."""
        return nn.functional.interpolate(x, scale_factor=self.scale_factor, mode=self.mode)

四、总结

根据图片和论文其实无法得到每一个卷积层具体的参数，估计需要有一套自己对卷积网络的理解才能独自建立网络。对于复现来说，最难的不是网络搭建，而是loss损失函数设计和数据集设计，见后文。

五、完整代码

代码

# -*- coding: utf-8 -*-
# @Time    : 2019/8/23 18:07
# @Author  : zwenc
# @File    : net.py

import time
import torch
import torch.nn as nn
import math
from collections import OrderedDict
import torch.nn.functional


class Conv2d(nn.Module):
    """Convolution unit: ``nn.Conv2d`` -> ``nn.BatchNorm2d`` -> ``nn.LeakyReLU(0.1)``.

    Every constructor argument is forwarded to ``nn.Conv2d``; ``out_channels``
    additionally sizes the batch-norm layer.
    """

    def __init__(self, in_channels, out_channels, kernel_size, stride, padding, bias):
        super().__init__()
        convolution = nn.Conv2d(
            in_channels,
            out_channels,
            kernel_size=kernel_size,
            stride=stride,
            padding=padding,
            bias=bias,
        )
        normalization = nn.BatchNorm2d(out_channels)
        activation = nn.LeakyReLU(0.1)
        # Held in a single Sequential so the sub-modules stay indexed 0, 1, 2.
        self.conv = nn.Sequential(convolution, normalization, activation)

    def forward(self, x):
        """Run ``x`` through convolution, batch norm and leaky ReLU, in that order."""
        y = self.conv(x)
        return y


class BasicBlock(nn.Module):
    """Residual unit: a 1x1 ``Conv2d``, then a 3x3 ``Conv2d``, plus the input.

    ``planes`` is indexed as ``planes[0]`` (channels after the 1x1 layer) and
    ``planes[1]`` (channels after the 3x3 layer). The result of the 3x3 layer
    is added to the input, so the two must be shape-compatible.
    """

    def __init__(self, inplanes, planes):
        super().__init__()
        mid_channels = planes[0]
        out_channels = planes[1]

        # 1x1 layer changes the channel count; 3x3 layer (padding 1) keeps
        # the spatial size.
        self.Conv1 = Conv2d(inplanes, mid_channels, kernel_size=1, stride=1, padding=0, bias=False)
        self.Conv2 = Conv2d(mid_channels, out_channels, kernel_size=3, stride=1, padding=1, bias=False)

    def forward(self, x):
        """Return ``Conv2(Conv1(x)) + x``."""
        branch = self.Conv2(self.Conv1(x))
        branch += x  # skip connection
        return branch


class ConvSet(nn.Module):  # inplanes -> outplanes
    """Five stacked ``Conv2d`` units alternating 1x1 and 3x3 kernels.

    Channel flow:
    inplanes -> outplanes -> outplanes -> 2*outplanes -> 2*outplanes -> outplanes.
    All layers use stride 1 and no convolution bias; the 3x3 layers use
    padding 1 and the 1x1 layers use padding 0.
    """

    def __init__(self, inplanes, outplanes):
        super().__init__()
        wide = outplanes * 2
        # One row per layer: (in_channels, out_channels, kernel_size, padding).
        layer_specs = (
            (inplanes, outplanes, 1, 0),
            (outplanes, outplanes, 3, 1),
            (outplanes, wide, 1, 0),
            (wide, wide, 3, 1),
            (wide, outplanes, 1, 0),
        )
        stack = [Conv2d(c_in, c_out, k, 1, pad, False) for c_in, c_out, k, pad in layer_specs]
        self.convset = nn.Sequential(*stack)

    def forward(self, x):
        """Pass ``x`` through the five layers in order."""
        y = self.convset(x)
        return y


class Upsampling(nn.Module):
    """Parameter-free up-sampling layer wrapping ``nn.functional.interpolate``.

    Args:
        scale_factor: Multiplier applied to the spatial size. Defaults to 2.
        mode: Interpolation mode handed to ``interpolate``. Defaults to
            ``'nearest'``.

    Calling ``Upsampling()`` with no arguments keeps the original fixed
    behaviour (2x nearest-neighbour).
    """

    def __init__(self, scale_factor=2, mode='nearest'):
        super(Upsampling, self).__init__()
        self.scale_factor = scale_factor
        self.mode = mode

    def forward(self, x):
        """Return ``x`` resized by ``scale_factor`` using ``mode``."""
        return nn.functional.interpolate(x, scale_factor=self.scale_factor, mode=self.mode)


class Yolo3Net(nn.Module):
    """YOLOv3-style detector: a Darknet-53-like backbone plus three heads.

    The backbone is a 3x3 stem convolution followed by five stages. Each stage
    halves the spatial size with a stride-2 convolution and then applies a
    run of ``BasicBlock`` residual units. The outputs of the last three stages
    feed three detection heads; the two coarser heads are up-sampled and
    concatenated into the next finer one.

    Args:
        class_nums: Number of object classes. Every head emits
            ``3 * (class_nums + 5)`` channels.
        layers: Number of residual units in each of the five backbone stages.
            The default ``(1, 2, 8, 8, 4)`` follows darknet53. Any indexable
            sequence of five ints is accepted.

    ``forward`` returns ``(out_52, out_26, out_13)``, finest grid first. The
    suffixes refer to the grid sizes obtained for a 416x416 input.
    """

    def __init__(self, class_nums, layers=(1, 2, 8, 8, 4)):  # default follows darknet53
        super(Yolo3Net, self).__init__()
        self.class_nums = class_nums
        # Running channel count; _make_layer reads and updates it, so the
        # stages below must be built in order.
        self.inplanes = 32
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(self.inplanes)
        self.relu1 = nn.LeakyReLU(0.1)

        self.layer1 = self._make_layer([32, 64], layers[0], 0)
        self.layer2 = self._make_layer([64, 128], layers[1], 1)
        self.layer3 = self._make_layer([128, 256], layers[2], 2)
        self.layer4 = self._make_layer([256, 512], layers[3], 3)
        self.layer5 = self._make_layer([512, 1024], layers[4], 4)

        # NOTE: the attribute names "delection_*" (sic) are kept as-is because
        # they are part of the state_dict keys.
        # NOTE(review): the final 1x1 convolution of each head has neither
        # batch norm nor bias; detection output layers commonly carry a bias.
        # Left unchanged to keep existing checkpoints loadable - confirm.
        self.convset_13 = ConvSet(1024, 512)
        self.delection_13 = nn.Sequential(
            Conv2d(512, 512, 3, 1, 1, False),
            # e.g. 3 * (1 + 5) = 18 for num_anchors = 3, num_classes = 1
            nn.Conv2d(512, 3 * (self.class_nums + 5), kernel_size=1, stride=1, padding=0, bias=False)
        )
        self.up13_to_26 = nn.Sequential(
            Conv2d(512, 256, 1, 1, 0, False),
            Upsampling()
        )

        self.convset_26 = ConvSet(768, 512)  # 512 (layer4) + 256 (up-sampled) = 768
        self.delection_26 = nn.Sequential(
            Conv2d(512, 512, 3, 1, 1, False),
            nn.Conv2d(512, 3 * (self.class_nums + 5), kernel_size=1, stride=1, padding=0, bias=False)
        )
        self.up26_to_52 = nn.Sequential(
            Conv2d(512, 256, 1, 1, 0, False),
            Upsampling()
        )

        self.convset_52 = ConvSet(512, 512)  # 256 (layer3) + 256 (up-sampled) = 512
        self.delection_52 = nn.Sequential(
            Conv2d(512, 512, 3, 1, 1, False),
            nn.Conv2d(512, 3 * (self.class_nums + 5), kernel_size=1, stride=1, padding=0, bias=False)
        )

        # Output widths of the five backbone stages.
        self.layers_out_filters = [64, 128, 256, 512, 1024]

        # Re-initialise all weights: convolutions get a zero-mean normal with
        # std sqrt(2 / (kh * kw * out_channels)); batch norm gets weight 1 and
        # bias 0.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                nn.init.normal_(m.weight, 0, math.sqrt(2. / n))
            elif isinstance(m, nn.BatchNorm2d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def _make_layer(self, planes, blocks, layer_num):
        """Build one backbone stage.

        Args:
            planes: Two-element sequence. ``planes[1]`` is the stage's output
                width; the whole sequence is passed on to each ``BasicBlock``.
            blocks: Number of ``BasicBlock`` units appended after the
                down-sampling layers.
            layer_num: Stage index. Not used inside this method.

        Returns:
            An ``nn.Sequential`` with named children ``ds_conv``, ``ds_bn``,
            ``ds_relu`` and ``residual_0`` .. ``residual_{blocks-1}``.

        Side effect: sets ``self.inplanes`` to ``planes[1]``.
        """
        layers = []

        # Down-sample: the stride-2 3x3 convolution halves the spatial size.
        layers.append(("ds_conv", nn.Conv2d(self.inplanes, planes[1], kernel_size=3, stride=2, padding=1, bias=False)))
        layers.append(("ds_bn", nn.BatchNorm2d(planes[1])))
        layers.append(("ds_relu", nn.LeakyReLU(0.1)))
        # Residual units operate at the stage's output width.
        self.inplanes = planes[1]
        for i in range(0, blocks):
            layers.append(("residual_{}".format(i), BasicBlock(self.inplanes, planes)))
        return nn.Sequential(OrderedDict(layers))

    def forward(self, x):
        """Run the detector on an image batch.

        Args:
            x: Input batch with 3 channels (``conv1`` expects 3 input
                channels).

        Returns:
            Tuple ``(out_52, out_26, out_13)`` of head outputs, each with
            ``3 * (class_nums + 5)`` channels.
        """
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu1(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x_52 = self.layer3(x)  # backbone output 1
        x_26 = self.layer4(x_52)  # backbone output 2
        x_13 = self.layer5(x_26)  # backbone output 3

        x_13 = self.convset_13(x_13)
        out_13 = self.delection_13(x_13)  # network output 1

        x_13 = self.up13_to_26(x_13)

        # Concatenate the up-sampled coarse features with layer4's output.
        x_26 = torch.cat((x_13, x_26), dim=1)
        x_26 = self.convset_26(x_26)
        out_26 = self.delection_26(x_26)  # network output 2

        x_26 = self.up26_to_52(x_26)

        # For a 416x416 input:
        # x_26.size() = torch.Size([?, 256, 52, 52])
        # x_52.size() = torch.Size([?, 256, 52, 52])
        x_52 = torch.cat((x_26, x_52), dim=1)  # dim=1 joins along the channel axis
        x_52 = self.convset_52(x_52)
        out_52 = self.delection_52(x_52)  # network output 3

        return out_52, out_26, out_13

if __name__ == '__main__':
    # Smoke test: build the network for 5 classes, print its structure, then
    # time two forward passes on random 416x416 inputs and print the shape of
    # each of the three outputs.
    model = Yolo3Net(class_nums=5)
    print(model)
    model.eval()

    # Inference only: disable autograd so no graph is recorded for the
    # forward passes.
    with torch.no_grad():
        for i in range(2):
            # perf_counter is monotonic, so the elapsed time cannot be skewed
            # by system clock adjustments.
            t1 = time.perf_counter()
            x = torch.rand(1, 3, 416, 416)
            out3 = model(x)
            for out in out3:
                print(out.shape)
            cnt = time.perf_counter() - t1
            print(cnt)

运行测试

torch.Size([1, 30, 52, 52])
torch.Size([1, 30, 26, 26])
torch.Size([1, 30, 13, 13])
2.550520658493042
torch.Size([1, 30, 52, 52])
torch.Size([1, 30, 26, 26])
torch.Size([1, 30, 13, 13])
2.7060494422912598

参考文献

¹. yolov3 darknet53网络及mobilenet改进 ↩