Unverified Commit 9e8d025b authored by Ryan Li, committed by GitHub

Merge pull request #3824 from RyanXLi/yolov4_network

added yolo v4 network
parents 9caa197f d535cdf5
......@@ -5,7 +5,8 @@ model = dict(
pretrained='open-mmlab://darknet53',
backbone=dict(type='Darknet', depth=53, out_indices=(3, 4, 5)),
neck=dict(
type='YOLOV3Neck',
type='YOLONeck',
yolo_version='v3',
num_scales=3,
in_channels=[1024, 512, 256],
out_channels=[512, 256, 128]),
......
......@@ -2,6 +2,7 @@
import logging
import torch
import torch.nn as nn
from mmcv.cnn import ConvModule, constant_init, kaiming_init
from mmcv.runner import load_checkpoint
......@@ -20,6 +21,8 @@ class ResBlock(nn.Module):
Args:
in_channels (int): The input channels. Must be even when yolo_version
is 'v3'.
yolo_version (str): The version of YOLO to build, must be 'v3' or 'v4'.
Default: 'v3'.
conv_cfg (dict): Config dict for convolution layer. Default: None.
norm_cfg (dict): Dictionary to construct and config norm layer.
Default: dict(type='BN', requires_grad=True)
......@@ -29,19 +32,25 @@ class ResBlock(nn.Module):
def __init__(self,
in_channels,
yolo_version='v3',
conv_cfg=None,
norm_cfg=dict(type='BN', requires_grad=True),
act_cfg=dict(type='LeakyReLU', negative_slope=0.1)):
super(ResBlock, self).__init__()
assert in_channels % 2 == 0 # ensure the in_channels is even
half_in_channels = in_channels // 2
if yolo_version not in ('v3', 'v4'):
raise NotImplementedError('Only YOLO v3 and v4 are supported.')
# shortcut
cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
self.conv1 = ConvModule(in_channels, half_in_channels, 1, **cfg)
self.conv2 = ConvModule(
half_in_channels, in_channels, 3, padding=1, **cfg)
if yolo_version == 'v3':
assert in_channels % 2 == 0 # ensure the in_channels is even
mid_channels = in_channels // 2
else:
mid_channels = in_channels
self.conv1 = ConvModule(in_channels, mid_channels, 1, **cfg)
self.conv2 = ConvModule(mid_channels, in_channels, 3, padding=1, **cfg)
def forward(self, x):
residual = x
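The practical difference between the two versions is the bottleneck width: with yolo_version='v3' the first 1x1 conv halves the channels, while with 'v4' it keeps them. A minimal sketch of that contract, assuming ResBlock remains importable from `mmdet.models.backbones.darknet`:

```python
import torch
from mmdet.models.backbones.darknet import ResBlock  # assumed import path

x = torch.rand(1, 64, 32, 32)
block_v3 = ResBlock(64, yolo_version='v3')  # 1x1 conv: 64 -> 32, 3x3 conv: 32 -> 64
block_v4 = ResBlock(64, yolo_version='v4')  # 1x1 conv: 64 -> 64, 3x3 conv: 64 -> 64
# Both preserve the input shape, so the residual addition in forward() is valid.
assert block_v3(x).shape == x.shape
assert block_v4(x).shape == x.shape
```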
......@@ -61,6 +70,9 @@ class Darknet(nn.Module):
out_indices (Sequence[int]): Output from which stages.
frozen_stages (int): Stages to be frozen (stop grad and set eval mode).
-1 means not freezing any parameters. Default: -1.
with_csp (bool): Whether the Darknet uses CSP (Cross Stage Partial
network), a feature of YOLO v4. See `CSPNet
<https://arxiv.org/abs/1911.11929>`_ for details. Default: False.
conv_cfg (dict): Config dict for convolution layer. Default: None.
norm_cfg (dict): Dictionary to construct and config norm layer.
Default: dict(type='BN', requires_grad=True)
......@@ -95,6 +107,7 @@ class Darknet(nn.Module):
depth=53,
out_indices=(3, 4, 5),
frozen_stages=-1,
with_csp=False,
conv_cfg=None,
norm_cfg=dict(type='BN', requires_grad=True),
act_cfg=dict(type='LeakyReLU', negative_slope=0.1),
......@@ -115,9 +128,13 @@ class Darknet(nn.Module):
for i, n_layers in enumerate(self.layers):
layer_name = f'conv_res_block{i + 1}'
in_c, out_c = self.channels[i]
self.add_module(
layer_name,
self.make_conv_res_block(in_c, out_c, n_layers, **cfg))
if with_csp:
conv_module = CspResBlock(
in_c, out_c, n_layers, is_first_block=(i == 0), **cfg)
else:
conv_module = self.make_conv_res_block(in_c, out_c, n_layers,
**cfg)
self.add_module(layer_name, conv_module)
self.cr_blocks.append(layer_name)
self.norm_eval = norm_eval
......@@ -197,3 +214,68 @@ class Darknet(nn.Module):
model.add_module('res{}'.format(idx),
ResBlock(out_channels, **cfg))
return model
class CspResBlock(nn.Module):
"""This class makes the conv_res_block in YOLO v4. It has CSP integrated,
hence different from the regular conv_res_block build with
`make_conv_res_block`.
Args:
in_channels (int): The number of input channels.
out_channels (int): The number of output channels.
res_repeat (int): The number of ResBlocks.
is_first_block (bool): Whether the CspResBlock is the
first in the Darknet. This affects the structure of the
block. Default: False.
conv_cfg (dict): Config dict for convolution layer. Default: None.
norm_cfg (dict): Dictionary to construct and config norm layer.
Default: dict(type='BN', requires_grad=True)
act_cfg (dict): Config dict for activation layer.
Default: dict(type='LeakyReLU', negative_slope=0.1).
"""
def __init__(self,
in_channels,
out_channels,
res_repeat,
is_first_block=False,
conv_cfg=None,
norm_cfg=dict(type='BN', requires_grad=True),
act_cfg=dict(type='LeakyReLU', negative_slope=0.1)):
super(CspResBlock, self).__init__()
cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
bottleneck_channels = out_channels if is_first_block else in_channels
self.preconv = ConvModule(
in_channels, out_channels, 3, stride=2, padding=1, **cfg)
self.shortconv = ConvModule(
out_channels, bottleneck_channels, 1, stride=1, **cfg)
self.mainconv = ConvModule(
out_channels, bottleneck_channels, 1, stride=1, **cfg)
self.blocks = nn.Sequential()
for idx in range(res_repeat):
if is_first_block:
self.blocks.add_module('res{}'.format(idx),
ResBlock(bottleneck_channels, **cfg))
else:
self.blocks.add_module(
'res{}'.format(idx),
ResBlock(bottleneck_channels, yolo_version='v4', **cfg))
self.postconv = ConvModule(
bottleneck_channels, bottleneck_channels, 1, stride=1, **cfg)
self.finalconv = ConvModule(
2 * bottleneck_channels, out_channels, 1, stride=1, **cfg)
def forward(self, x):
x = self.preconv(x)
x_short = self.shortconv(x)
x_main = self.mainconv(x)
x_main = self.blocks(x_main)
x_main = self.postconv(x_main)
x_final = torch.cat((x_main, x_short), 1)
x_final = self.finalconv(x_final)
return x_final
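Like the regular stage built by `make_conv_res_block`, a CspResBlock downsamples by 2 through its stride-2 `preconv` and emits `out_channels` feature maps, so the two are interchangeable stage by stage when `with_csp` toggles. A minimal shape check, assuming both remain importable from `mmdet.models.backbones.darknet` and that `make_conv_res_block` keeps its current signature:

```python
import torch
from mmdet.models.backbones.darknet import CspResBlock, Darknet  # assumed import path

x = torch.rand(1, 64, 56, 56)
csp_stage = CspResBlock(64, 128, 2)                    # CSP stage with 2 residual blocks
plain_stage = Darknet.make_conv_res_block(64, 128, 2)  # regular Darknet stage
# Both halve the spatial size and output out_channels feature maps.
assert csp_stage(x).shape == (1, 128, 28, 28)
assert plain_stage(x).shape == (1, 128, 28, 28)
```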
......@@ -21,12 +21,12 @@ from .rpn import RPN
from .single_stage import SingleStageDetector
from .two_stage import TwoStageDetector
from .yolact import YOLACT
from .yolo import YOLOV3
from .yolo import YOLOV3, YOLOV4
__all__ = [
'ATSS', 'BaseDetector', 'SingleStageDetector', 'TwoStageDetector', 'RPN',
'FastRCNN', 'FasterRCNN', 'MaskRCNN', 'CascadeRCNN', 'HybridTaskCascade',
'RetinaNet', 'FCOS', 'GridRCNN', 'MaskScoringRCNN', 'RepPointsDetector',
'FOVEA', 'FSAF', 'NASFCOS', 'PointRend', 'GFL', 'CornerNet', 'PAA',
'YOLOV3', 'YOLACT'
'YOLACT', 'YOLOV3', 'YOLOV4'
]
......@@ -16,3 +16,17 @@ class YOLOV3(SingleStageDetector):
pretrained=None):
super(YOLOV3, self).__init__(backbone, neck, bbox_head, train_cfg,
test_cfg, pretrained)
@DETECTORS.register_module()
class YOLOV4(SingleStageDetector):
def __init__(self,
backbone,
neck,
bbox_head,
train_cfg=None,
test_cfg=None,
pretrained=None):
super(YOLOV4, self).__init__(backbone, neck, bbox_head, train_cfg,
test_cfg, pretrained)
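With the detector registered, a YOLO v4 model config could wire the new pieces together roughly as sketched below. This is illustrative only: the channel lists reuse the values from the v3 config above, the spp_scales follow the pooling sizes mentioned in the DetectionBlock docstring, and the bbox_head entry is a placeholder for the head config defined elsewhere in this PR.

```python
# Hypothetical top-level config sketch; not taken verbatim from this PR.
model = dict(
    type='YOLOV4',
    backbone=dict(type='Darknet', depth=53, with_csp=True),
    neck=dict(
        type='YOLONeck',
        yolo_version='v4',
        num_scales=3,
        in_channels=[1024, 512, 256],
        out_channels=[512, 256, 128],
        spp_scales=(5, 9, 13)),
    bbox_head=dict())  # placeholder: fill in the YOLO head settings here
```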
......@@ -6,9 +6,9 @@ from .nas_fpn import NASFPN
from .nasfcos_fpn import NASFCOS_FPN
from .pafpn import PAFPN
from .rfp import RFP
from .yolo_neck import YOLOV3Neck
from .yolo_neck import YOLONeck
__all__ = [
'FPN', 'BFP', 'HRFPN', 'NASFPN', 'FPN_CARAFE', 'PAFPN', 'NASFCOS_FPN',
'RFP', 'YOLOV3Neck'
'RFP', 'YOLONeck'
]
......@@ -11,20 +11,21 @@ from ..builder import NECKS
class DetectionBlock(nn.Module):
"""Detection block in YOLO neck.
Let out_channels = n, the DetectionBlock contains:
Six ConvLayers, 1 Conv2D Layer and 1 YoloLayer.
The first 6 ConvLayers are formed the following way:
1x1xn, 3x3x2n, 1x1xn, 3x3x2n, 1x1xn, 3x3x2n.
The Conv2D layer is 1x1x255.
Some block will have branch after the fifth ConvLayer.
Let out_channels = n. The DetectionBlock normally contains 5 ConvModules
whose sizes are 1x1xn, 3x3x2n, 1x1xn, 3x3x2n, and 1x1xn respectively.
If SPP is enabled, the DetectionBlock contains 6 ConvModules and
3 pooling layers, with sizes 1x1xn, 3x3x2n, 1x1xn,
5x5 maxpool, 9x9 maxpool, 13x13 maxpool, 1x1xn, 3x3x2n, 1x1xn.
The number of input channels is arbitrary (in_channels).
Args:
in_channels (int): The number of input channels.
out_channels (int): The number of output channels.
conv_cfg (dict): Config dict for convolution layer. Default: None.
spp_scales (tuple | None): The kernel sizes for spatial pyramid pooling.
When set to None, SPP is disabled. Default: None.
norm_cfg (dict): Dictionary to construct and config norm layer.
Default: dict(type='BN', requires_grad=True)
Default: dict(type='BN', requires_grad=True).
act_cfg (dict): Config dict for activation layer.
Default: dict(type='LeakyReLU', negative_slope=0.1).
"""
......@@ -32,10 +33,12 @@ class DetectionBlock(nn.Module):
def __init__(self,
in_channels,
out_channels,
spp_scales=None,
conv_cfg=None,
norm_cfg=dict(type='BN', requires_grad=True),
act_cfg=dict(type='LeakyReLU', negative_slope=0.1)):
super(DetectionBlock, self).__init__()
self.spp_on = spp_scales is not None
double_out_channels = out_channels * 2
# shortcut
......@@ -44,6 +47,15 @@ class DetectionBlock(nn.Module):
self.conv2 = ConvModule(
out_channels, double_out_channels, 3, padding=1, **cfg)
self.conv3 = ConvModule(double_out_channels, out_channels, 1, **cfg)
if self.spp_on:
# Keep the parameter-free poolers in a ModuleList so they are
# registered as submodules.
self.poolers = nn.ModuleList([
nn.MaxPool2d(size, 1, padding=(size - 1) // 2)
for size in spp_scales
])
self.conv_spp = ConvModule(out_channels * (len(spp_scales) + 1),
out_channels, 1, **cfg)
self.conv4 = ConvModule(
out_channels, double_out_channels, 3, padding=1, **cfg)
self.conv5 = ConvModule(double_out_channels, out_channels, 1, **cfg)
......@@ -52,29 +64,40 @@ class DetectionBlock(nn.Module):
tmp = self.conv1(x)
tmp = self.conv2(tmp)
tmp = self.conv3(tmp)
if self.spp_on:
spp_feats = [tmp] + [pooler(tmp) for pooler in self.poolers]
tmp = torch.cat(spp_feats[::-1], 1)
tmp = self.conv_spp(tmp)
tmp = self.conv4(tmp)
out = self.conv5(tmp)
return out
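The SPP branch only works because every max-pooling layer uses stride 1 with padding `(size - 1) // 2`, so each pooled map keeps the input resolution and the channel-wise concatenation lines up. A standalone sketch of that property in plain PyTorch:

```python
import torch
import torch.nn as nn

x = torch.rand(1, 256, 13, 13)
poolers = [nn.MaxPool2d(size, 1, padding=(size - 1) // 2) for size in (5, 9, 13)]
feats = [x] + [pool(x) for pool in poolers]
# All pooled maps keep the 13x13 resolution, so channels stack to 4 * 256.
assert all(f.shape[-2:] == x.shape[-2:] for f in feats)
assert torch.cat(feats[::-1], 1).shape == (1, 1024, 13, 13)
```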
@NECKS.register_module()
class YOLOV3Neck(nn.Module):
"""The neck of YOLOV3.
class YOLONeck(nn.Module):
"""The neck of YOLO v3 or v4.
It can be treated as a simplified version of FPN. It
will take the result from Darknet backbone and do some upsampling and
concatenation. It will finally output the detection result.
It can be regarded as a simplified version of FPN.
It takes the feature maps from different levels of the Darknet backbone
and, after some upsampling and concatenation, outputs a set of
new feature maps which are passed to the YOLO head.
Note:
The input feats should be from top to bottom.
i.e., from high-lvl to low-lvl
But YOLOV3Neck will process them in reversed order.
But YOLONeck will process them in reversed order.
i.e., from bottom (high-lvl) to top (low-lvl)
Args:
num_scales (int): The number of scales / stages.
in_channels (int): The number of input channels.
out_channels (int): The number of output channels.
spp_scales (tuple | None): The kernel sizes for spatial pyramid pooling.
When set to None, SPP is disabled. Default: None.
yolo_version (str): The version of YOLO to build, must be 'v3' or 'v4'.
Default: 'v3'.
conv_cfg (dict): Config dict for convolution layer. Default: None.
norm_cfg (dict): Dictionary to construct and config norm layer.
Default: dict(type='BN', requires_grad=True)
......@@ -86,27 +109,69 @@ class YOLOV3Neck(nn.Module):
num_scales,
in_channels,
out_channels,
spp_scales=None,
yolo_version='v3',
conv_cfg=None,
norm_cfg=dict(type='BN', requires_grad=True),
act_cfg=dict(type='LeakyReLU', negative_slope=0.1)):
super(YOLOV3Neck, self).__init__()
super(YOLONeck, self).__init__()
if yolo_version not in ('v3', 'v4'):
raise NotImplementedError('Only YOLO v3 and v4 are supported.')
assert (num_scales == len(in_channels) == len(out_channels))
self.num_scales = num_scales
self.in_channels = in_channels
self.out_channels = out_channels
self.spp_on = spp_scales is not None
self.yolo_version = yolo_version
# shortcut
cfg = dict(conv_cfg=conv_cfg, norm_cfg=norm_cfg, act_cfg=act_cfg)
# If SPP is enabled, the first DetectionBlock is built with an SPP
# module inserted into it; the other DetectionBlocks stay unchanged.
self.detect1 = DetectionBlock(self.in_channels[0],
self.out_channels[0], spp_scales, **cfg)
# To support arbitrary scales, the code looks awful, but it works.
# A better solution is welcome.
self.detect1 = DetectionBlock(in_channels[0], out_channels[0], **cfg)
for i in range(1, self.num_scales):
in_c, out_c = self.in_channels[i], self.out_channels[i]
self.add_module(f'conv{i}', ConvModule(in_c, out_c, 1, **cfg))
# in_c + out_c : High-lvl feats will be cat with low-lvl feats
self.add_module(f'detect{i+1}',
DetectionBlock(in_c + out_c, out_c, **cfg))
if yolo_version == 'v3':
for i in range(1, self.num_scales):
in_c, out_c = self.in_channels[i], self.out_channels[i]
self.add_module(f'conv{i}', ConvModule(in_c, out_c, 1, **cfg))
# in_c + out_c : High-lvl feats will be cat with low-lvl feats
self.add_module(f'detect{i+1}',
DetectionBlock(in_c + out_c, out_c, **cfg))
else: # YOLO v4
for i in range(1, self.num_scales):
in_c, out_c = self.in_channels[i], self.out_channels[i]
self.add_module(f'upsample_conv{i}',
ConvModule(in_c, out_c, 1, **cfg))
self.add_module(f'feat_conv{i}',
ConvModule(in_c, out_c, 1, **cfg))
# Note: here YOLO v4 differs from YOLO v3: the input channels
# change from in_c + out_c to in_c because of the newly added
# feat_conv.
self.add_module(f'detect{i + 1}',
DetectionBlock(in_c, out_c, **cfg))
# Downsampling PANet path.
# e.g. if num_scales is 3 (as in the original YOLO v4), i will be
# 3 and 4, the downsample conv (ds) will use channels from idx 2 and 1,
# and the detection block (det) will use channels from idx 1 and 0.
for i in range(self.num_scales, self.num_scales * 2 - 1):
ds_channel_idx = self.num_scales * 2 - 1 - i
ds_in_c = self.out_channels[ds_channel_idx]
ds_out_c = self.in_channels[ds_channel_idx]
det_channel_idx = ds_channel_idx - 1
det_in_c = self.in_channels[det_channel_idx]
det_out_c = self.out_channels[det_channel_idx]
self.add_module(
f'downsample_conv{i - self.num_scales + 1}',
ConvModule(
ds_in_c, ds_out_c, 3, stride=2, padding=1, **cfg))
self.add_module(f'detect{i + 1}',
DetectionBlock(det_in_c, det_out_c, **cfg))
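# For example, with num_scales=3 and the channel lists from the sample
# config in this PR (in_channels=[1024, 512, 256],
# out_channels=[512, 256, 128]), the loop above creates:
#   downsample_conv1: ConvModule(128, 256, 3, stride=2)  # ds_channel_idx = 2
#   detect4:          DetectionBlock(512, 256)           # det_channel_idx = 1
#   downsample_conv2: ConvModule(256, 512, 3, stride=2)  # ds_channel_idx = 1
#   detect5:          DetectionBlock(1024, 512)          # det_channel_idx = 0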
def forward(self, feats):
assert len(feats) == self.num_scales
......@@ -116,17 +181,43 @@ class YOLOV3Neck(nn.Module):
out = self.detect1(feats[-1])
outs.append(out)
for i, x in enumerate(reversed(feats[:-1])):
conv = getattr(self, f'conv{i+1}')
tmp = conv(out)
# Cat with low-lvl feats
tmp = F.interpolate(tmp, scale_factor=2)
tmp = torch.cat((tmp, x), 1)
detect = getattr(self, f'detect{i+2}')
out = detect(tmp)
outs.append(out)
if self.yolo_version == 'v3':
for i, x in enumerate(reversed(feats[:-1])):
conv = getattr(self, f'conv{i+1}')
tmp = conv(out)
# Cat with low-lvl feats
tmp = F.interpolate(tmp, scale_factor=2)
tmp = torch.cat((tmp, x), 1)
detect = getattr(self, f'detect{i+2}')
out = detect(tmp)
outs.append(out)
else: # YOLO v4
for i, x in enumerate(reversed(feats[:-1])):
upsample_conv = getattr(self, f'upsample_conv{i + 1}')
tmp = upsample_conv(out)
feat_conv = getattr(self, f'feat_conv{i + 1}')
tmp_x = feat_conv(x)
# Cat with low-lvl feats
tmp = F.interpolate(tmp, scale_factor=2)
tmp = torch.cat((tmp_x, tmp), 1)
detect = getattr(self, f'detect{i + 2}')
out = detect(tmp)
outs.append(out)
cur_feat = outs[-1]
for i in range(self.num_scales - 1):
downsample_conv = getattr(self, f'downsample_conv{i + 1}')
tmp = downsample_conv(cur_feat)
tmp = torch.cat((tmp, outs[-2 - i]), 1)
detect = getattr(self, f'detect{i + self.num_scales + 1}')
cur_feat = detect(tmp)
outs[-2 - i] = cur_feat
return tuple(outs)
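As a sanity check of the bookkeeping above, the v4 neck can be exercised with dummy feature maps whose channels match the sample config in this PR. A minimal sketch (the spp_scales values are an assumption; the import follows the necks/__init__.py change above):

```python
import torch
from mmdet.models.necks import YOLONeck

# Dummy Darknet-53 feature maps for a 416x416 input, ordered low-level to high-level.
feats = [
    torch.rand(1, 256, 52, 52),
    torch.rand(1, 512, 26, 26),
    torch.rand(1, 1024, 13, 13),
]
neck = YOLONeck(
    num_scales=3,
    in_channels=[1024, 512, 256],
    out_channels=[512, 256, 128],
    spp_scales=(5, 9, 13),
    yolo_version='v4')
outs = neck(feats)
# After the top-down pass and the PANet bottom-up pass, each scale keeps its
# configured out_channels, ordered high-level first.
assert outs[0].shape == (1, 512, 13, 13)
assert outs[1].shape == (1, 256, 26, 26)
assert outs[2].shape == (1, 128, 52, 52)
```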
......
......@@ -4,7 +4,8 @@ from mmcv.ops import DeformConv2dPack
from torch.nn.modules import AvgPool2d, GroupNorm
from torch.nn.modules.batchnorm import _BatchNorm
from mmdet.models.backbones import RegNet, Res2Net, ResNet, ResNetV1d, ResNeXt
from mmdet.models.backbones import (Darknet, RegNet, Res2Net, ResNet,
ResNetV1d, ResNeXt)
from mmdet.models.backbones.hourglass import HourglassNet
from mmdet.models.backbones.res2net import Bottle2neck
from mmdet.models.backbones.resnet import BasicBlock, Bottleneck
......@@ -854,3 +855,33 @@ def test_hourglass_backbone():
assert len(feat) == 2
assert feat[0].shape == torch.Size([1, 256, 128, 128])
assert feat[1].shape == torch.Size([1, 256, 128, 128])
def test_darknet_backbone():
with pytest.raises(KeyError):
# Darknet depth should be 53
Darknet(depth=50)
# Test Darknet-53 without CSP (YOLO v3 backbone)
model = Darknet(depth=53, with_csp=False)
model.init_weights()
model.train()
imgs = torch.randn(1, 3, 416, 416)
feat = model(imgs)
assert len(feat) == 3
assert feat[0].shape == torch.Size([1, 256, 52, 52])
assert feat[1].shape == torch.Size([1, 512, 26, 26])
assert feat[2].shape == torch.Size([1, 1024, 13, 13])
# Test Darknet-53 with CSP (YOLO v4 backbone)
model = Darknet(depth=53, with_csp=True)
model.init_weights()
model.train()
imgs = torch.randn(1, 3, 416, 416)
feat = model(imgs)
assert len(feat) == 3
assert feat[0].shape == torch.Size([1, 256, 52, 52])
assert feat[1].shape == torch.Size([1, 512, 26, 26])
assert feat[2].shape == torch.Size([1, 1024, 13, 13])