I am trying implement YOLO v3 on custom dataset with a pretrained resnext101 backbone and connecting layer1, layer2 and layer3 for the YOLO object detection layers.
class MDENet(BaseModel):
# YOLOv3 object detection model
def __init__(self, yolo_props, path=None, features=256, non_negative=True, img_size=(416, 416), verbose=False):
super(MDENet, self).__init__()
use_pretrained = True if path is None else False
self.pretrained, self.scratch = _make_encoder(features, use_pretrained)
for param in self.pretrained.parameters():
param.requires_grad = False
# print(self.pretrained)
self.scratch.refinenet4 = FeatureFusionBlock(features)
self.scratch.refinenet3 = FeatureFusionBlock(features)
self.scratch.refinenet2 = FeatureFusionBlock(features)
self.scratch.refinenet1 = FeatureFusionBlock(features)
self.scratch.output_conv = nn.Sequential(
nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1),
Interpolate(scale_factor=2, mode="bilinear"),
nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1),
nn.ReLU(True),
nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
nn.ReLU(True) if non_negative else nn.Identity(),
)
if path:
self.load(path)
# YOLO head
conv_output = (int(yolo_props["num_classes"]) + 5) * int((len(yolo_props["anchors"]) / 3))
self.upsample1 = nn.Sequential(
nn.Conv2d(1024, 256, kernel_size=1),
nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
)
self.upsample2 = nn.Sequential(
nn.Conv2d(512, 128, kernel_size=1),
nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
)
self.identity = nn.Identity()
# small objects
self.yolo1_learner = nn.Sequential(
nn.Conv2d(1024, 512, kernel_size=1),
nn.BatchNorm2d(512),
nn.ReLU(inplace=True),
nn.Conv2d(512, 1024, kernel_size=3),
nn.BatchNorm2d(1024),
nn.ReLU(inplace=True)
)
self.yolo1_reduce = nn.Conv2d(1024, conv_output, kernel_size=1, stride=1, padding=1)
self.yolo1 = YOLOLayer(yolo_props["anchors"][:3],
nc=int(yolo_props["num_classes"]),
img_size=img_size,
yolo_index=0,
layers=[],
stride=32)
# medium objects
self.yolo2_learner = nn.Sequential(
nn.Conv2d(768, 256, kernel_size=1, padding=1),
nn.BatchNorm2d(256),
nn.ReLU(inplace=True),
nn.Conv2d(256, 512, kernel_size=3),
nn.BatchNorm2d(512),
nn.ReLU(inplace=True),
nn.Conv2d(512, 256, kernel_size=1, padding=1),
nn.BatchNorm2d(256),
nn.ReLU(inplace=True),
nn.Conv2d(256, 512, kernel_size=3),
nn.BatchNorm2d(512),
nn.ReLU(inplace=True),
nn.Conv2d(512, 256, kernel_size=1, padding=1),
nn.BatchNorm2d(256),
nn.ReLU(inplace=True),
nn.Conv2d(256, 512, kernel_size=3),
nn.BatchNorm2d(512),
nn.ReLU(inplace=True)
)
self.yolo2_reduce = nn.Conv2d(512, conv_output, kernel_size=1, stride=1, padding=1)
self.yolo2 = YOLOLayer(yolo_props["anchors"][3:6],
nc=int(yolo_props["num_classes"]),
img_size=img_size,
yolo_index=1,
layers=[],
stride=16)
# large objects
self.yolo3_learner = nn.Sequential(
nn.Conv2d(384, 128, kernel_size=1, padding=1),
nn.BatchNorm2d(128),
nn.ReLU(inplace=True),
nn.Conv2d(128, 256, kernel_size=3),
nn.BatchNorm2d(256),
nn.ReLU(inplace=True),
nn.Conv2d(384, 128, kernel_size=1, padding=1),
nn.BatchNorm2d(128),
nn.ReLU(inplace=True),
nn.Conv2d(128, 256, kernel_size=3),
nn.BatchNorm2d(256),
nn.ReLU(inplace=True),
nn.Conv2d(384, 128, kernel_size=1, padding=1),
nn.BatchNorm2d(128),
nn.ReLU(inplace=True),
nn.Conv2d(128, 256, kernel_size=3),
nn.BatchNorm2d(256),
nn.ReLU(inplace=True)
)
self.yolo3_reduce = nn.Conv2d(256, conv_output, kernel_size=1, stride=1, padding=1)
self.yolo3 = YOLOLayer(yolo_props["anchors"][6:],
nc=int(yolo_props["num_classes"]),
img_size=img_size,
yolo_index=1,
layers=[],
stride=8)
This is my forward() function:
def forward(self, x):
# Pretrained resnet101
layer_1 = self.pretrained.layer1(x)
layer_2 = self.pretrained.layer2(layer_1)
layer_3 = self.pretrained.layer3(layer_2)
layer_4 = self.pretrained.layer4(layer_3)
# Depth Detection
layer_1_rn = self.scratch.layer1_rn(layer_1)
layer_2_rn = self.scratch.layer2_rn(layer_2)
layer_3_rn = self.scratch.layer3_rn(layer_3)
layer_4_rn = self.scratch.layer4_rn(layer_4)
path_4 = self.scratch.refinenet4(layer_4_rn)
path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
path_1 = self.scratch.refinenet1(path_2, layer_1_rn)
depth_out = self.scratch.output_conv(path_1)
# Object Detection
# small objects
yolo1_out = self.yolo1(self.yolo1_reduce(self.yolo1_learner(layer_3)))
layer_3 = self.upsample1(layer_3)
layer_3 = torch.cat([layer_3, layer_2], dim=1)
print("layer_3.shape", layer_3.shape)
# medium objects
layer_3 = self.yolo2_learner(layer_3)
yolo2_out = self.yolo2(self.yolo2_reduce(layer_3))
layer_2 = self.upsample2(layer_3)
layer_2 = torch.cat([layer_1, layer_2], dim=1)
print("layer_2.shape", layer_2.shape)
# large objects
layer_2 = self.yolo3_learner(layer_2)
yolo3_out = self.yolo3(self.yolo3_reduce(layer_2))
yolo_out = [yolo1_out, yolo2_out, yolo3_out]
return depth_out, yolo_out
while training I get
Traceback (most recent call last):
File "train.py", line 471, in <module>
train() # train normally
File "train.py", line 296, in train
midas_out, yolo_out = model(imgs)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/src/D/Research/EVA5-Vision-Squad/S15/torchutils/model/mde_net/mde_net.py", line 147, in forward
return self.forward_net(x)
File "/src/D/Research/EVA5-Vision-Squad/S15/torchutils/model/mde_net/mde_net.py", line 195, in forward_net
midas_out, yolo_out = self.run_batch(x)
File "/src/D/Research/EVA5-Vision-Squad/S15/torchutils/model/mde_net/mde_net.py", line 268, in run_batch
layer_2 = self.yolo3_learner(layer_2)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/container.py", line 100, in forward
input = module(input)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/conv.py", line 353, in forward
return self._conv_forward(input, self.weight)
File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/conv.py", line 350, in _conv_forward
self.padding, self.dilation, self.groups)
RuntimeError: Given groups=1, weight of size [128, 384, 1, 1], expected input[8, 256, 104, 104] to have 384 channels, but got 256 channels instead
at layer_2 = self.yolo3_learner(layer_2). self.yolo3_learner() expects 384 input channels. but when I print the layer_2.shape it results the tensor has same 384 channels.
layer_2.shape torch.Size([8, 384, 104, 104])
Am I doing anything wrong in the Conv2d or in torch.cat??