Conv2d acting weird on torch.cat output

I am trying to implement YOLOv3 on a custom dataset with a pretrained ResNeXt101 backbone, connecting layer1, layer2 and layer3 to the YOLO object detection layers.

class MDENet(BaseModel):
    # YOLOv3 object detection model

    def __init__(self, yolo_props, path=None, features=256, non_negative=True, img_size=(416, 416), verbose=False):
        
        super(MDENet, self).__init__()

        use_pretrained = path is None

        self.pretrained, self.scratch = _make_encoder(features, use_pretrained)

        # freeze the pretrained backbone
        for param in self.pretrained.parameters():
            param.requires_grad = False
        
        # print(self.pretrained)
        
        self.scratch.refinenet4 = FeatureFusionBlock(features)
        self.scratch.refinenet3 = FeatureFusionBlock(features)
        self.scratch.refinenet2 = FeatureFusionBlock(features)
        self.scratch.refinenet1 = FeatureFusionBlock(features)

        self.scratch.output_conv = nn.Sequential(
            nn.Conv2d(features, 128, kernel_size=3, stride=1, padding=1),
            Interpolate(scale_factor=2, mode="bilinear"),
            nn.Conv2d(128, 32, kernel_size=3, stride=1, padding=1),
            nn.ReLU(True),
            nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
            nn.ReLU(True) if non_negative else nn.Identity(),
        )

        if path:
            self.load(path)
        
        # YOLO head
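        # each scale outputs (num_classes + 5) values per anchor, with len(anchors)/3 anchors per scale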
        conv_output = (int(yolo_props["num_classes"]) + 5) * int((len(yolo_props["anchors"]) / 3))
        self.upsample1 = nn.Sequential(
            nn.Conv2d(1024, 256, kernel_size=1),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        )
        self.upsample2 = nn.Sequential(
            nn.Conv2d(512, 128, kernel_size=1),
            nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
        )
        self.identity = nn.Identity()
        
        # small objects
        self.yolo1_learner = nn.Sequential(
            nn.Conv2d(1024, 512, kernel_size=1),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 1024, kernel_size=3),
            nn.BatchNorm2d(1024),
            nn.ReLU(inplace=True)
        )
        self.yolo1_reduce = nn.Conv2d(1024, conv_output, kernel_size=1, stride=1, padding=1)
        self.yolo1 = YOLOLayer(yolo_props["anchors"][:3],
                               nc=int(yolo_props["num_classes"]),
                               img_size=img_size,
                               yolo_index=0,
                               layers=[],
                               stride=32)
        
        # medium objects
        self.yolo2_learner = nn.Sequential(
            nn.Conv2d(768, 256, kernel_size=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 512, kernel_size=3),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 256, kernel_size=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 512, kernel_size=3),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 256, kernel_size=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 512, kernel_size=3),
            nn.BatchNorm2d(512),
            nn.ReLU(inplace=True)
        )
        self.yolo2_reduce = nn.Conv2d(512, conv_output, kernel_size=1, stride=1, padding=1)
        self.yolo2 = YOLOLayer(yolo_props["anchors"][3:6],
                               nc=int(yolo_props["num_classes"]),
                               img_size=img_size,
                               yolo_index=1,
                               layers=[],
                               stride=16)

        # large objects
        self.yolo3_learner = nn.Sequential(
            nn.Conv2d(384, 128, kernel_size=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 256, kernel_size=3),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 128, kernel_size=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 256, kernel_size=3),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 128, kernel_size=1, padding=1),
            nn.BatchNorm2d(128),
            nn.ReLU(inplace=True),
            nn.Conv2d(128, 256, kernel_size=3),
            nn.BatchNorm2d(256),
            nn.ReLU(inplace=True)
        )
        self.yolo3_reduce = nn.Conv2d(256, conv_output, kernel_size=1, stride=1, padding=1)
        self.yolo3 = YOLOLayer(yolo_props["anchors"][6:],
                               nc=int(yolo_props["num_classes"]),
                               img_size=img_size,
                               yolo_index=2,
                               layers=[],
                               stride=8)

This is my forward() function:

    def forward(self, x):

        # Pretrained ResNeXt101 backbone
        layer_1 = self.pretrained.layer1(x)
        layer_2 = self.pretrained.layer2(layer_1)
        layer_3 = self.pretrained.layer3(layer_2)
        layer_4 = self.pretrained.layer4(layer_3)
        
        # Depth Detection
        layer_1_rn = self.scratch.layer1_rn(layer_1)
        layer_2_rn = self.scratch.layer2_rn(layer_2)
        layer_3_rn = self.scratch.layer3_rn(layer_3)
        layer_4_rn = self.scratch.layer4_rn(layer_4)

        path_4 = self.scratch.refinenet4(layer_4_rn)
        path_3 = self.scratch.refinenet3(path_4, layer_3_rn)
        path_2 = self.scratch.refinenet2(path_3, layer_2_rn)
        path_1 = self.scratch.refinenet1(path_2, layer_1_rn)

        depth_out = self.scratch.output_conv(path_1)
        
        # Object Detection
        # small objects
        yolo1_out = self.yolo1(self.yolo1_reduce(self.yolo1_learner(layer_3)))
        
        layer_3 = self.upsample1(layer_3)
        layer_3 = torch.cat([layer_3, layer_2], dim=1)
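        # layer_3 now has 256 (from upsample1) + 512 (layer_2) = 768 channels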
        print("layer_3.shape", layer_3.shape)

        # medium objects
        layer_3 = self.yolo2_learner(layer_3)
        yolo2_out = self.yolo2(self.yolo2_reduce(layer_3))
        
        layer_2 = self.upsample2(layer_3)
        layer_2 = torch.cat([layer_1, layer_2], dim=1)
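        # layer_2 now has 256 (layer_1) + 128 (from upsample2) = 384 channels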
        print("layer_2.shape", layer_2.shape)

        # large objects
        layer_2 = self.yolo3_learner(layer_2)
        yolo3_out = self.yolo3(self.yolo3_reduce(layer_2))
        
        yolo_out = [yolo1_out, yolo2_out, yolo3_out]
        
        return depth_out, yolo_out

While training I get:

Traceback (most recent call last):
  File "train.py", line 471, in <module>
    train()  # train normally
  File "train.py", line 296, in train
    midas_out, yolo_out = model(imgs)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/src/D/Research/EVA5-Vision-Squad/S15/torchutils/model/mde_net/mde_net.py", line 147, in forward
    return self.forward_net(x)
  File "/src/D/Research/EVA5-Vision-Squad/S15/torchutils/model/mde_net/mde_net.py", line 195, in forward_net
    midas_out, yolo_out = self.run_batch(x)
  File "/src/D/Research/EVA5-Vision-Squad/S15/torchutils/model/mde_net/mde_net.py", line 268, in run_batch
    layer_2 = self.yolo3_learner(layer_2)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/container.py", line 100, in forward
    input = module(input)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py", line 550, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/conv.py", line 353, in forward
    return self._conv_forward(input, self.weight)
  File "/usr/local/lib/python3.6/dist-packages/torch/nn/modules/conv.py", line 350, in _conv_forward
    self.padding, self.dilation, self.groups)
RuntimeError: Given groups=1, weight of size [128, 384, 1, 1], expected input[8, 256, 104, 104] to have 384 channels, but got 256 channels instead

at layer_2 = self.yolo3_learner(layer_2). self.yolo3_learner() expects 384 input channels, but when I print layer_2.shape, the tensor does have 384 channels:

layer_2.shape torch.Size([8, 384, 104, 104])

Am I doing anything wrong with Conv2d or torch.cat?

yolo3_learner is using a wrong layer config, i.e. the third nn.Conv2d layer expects 384 input channels, while the preceding conv outputs 256 channels. The same issue occurs at the 5th conv layer.
Note that the input to the block really does have 384 channels (as your print shows), so the first conv runs fine; the mismatch is raised deeper inside the nn.Sequential, which is why your printed shape looks correct while the error message reports 256 channels.
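
For reference, a sketch of the corrected block, assuming the intent was the same alternating 1x1 / 3x3 pattern used in yolo2_learner (the only change is the in_channels of the 3rd and 5th convs):

self.yolo3_learner = nn.Sequential(
    nn.Conv2d(384, 128, kernel_size=1, padding=1),
    nn.BatchNorm2d(128),
    nn.ReLU(inplace=True),
    nn.Conv2d(128, 256, kernel_size=3),
    nn.BatchNorm2d(256),
    nn.ReLU(inplace=True),
    nn.Conv2d(256, 128, kernel_size=1, padding=1),  # was nn.Conv2d(384, 128, ...)
    nn.BatchNorm2d(128),
    nn.ReLU(inplace=True),
    nn.Conv2d(128, 256, kernel_size=3),
    nn.BatchNorm2d(256),
    nn.ReLU(inplace=True),
    nn.Conv2d(256, 128, kernel_size=1, padding=1),  # was nn.Conv2d(384, 128, ...)
    nn.BatchNorm2d(128),
    nn.ReLU(inplace=True),
    nn.Conv2d(128, 256, kernel_size=3),
    nn.BatchNorm2d(256),
    nn.ReLU(inplace=True)
)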

Thanks a lot!!! It is difficult to debug this in big networks.
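
For next time: a small helper along these lines (a sketch; trace_shapes is a made-up name) pushes a dummy tensor through an nn.Sequential one submodule at a time and prints every output shape, so the offending layer shows up immediately:

import torch
import torch.nn as nn

def trace_shapes(seq, x):
    # Run x through each submodule in turn, printing the output shape,
    # so a channel mismatch points at the exact layer that raises.
    for i, module in enumerate(seq):
        try:
            x = module(x)
        except RuntimeError:
            print(f"failed at submodule {i}: {module}")
            raise
        print(f"{i}: {module.__class__.__name__} -> {tuple(x.shape)}")
    return x

# e.g. with the shapes from the error message:
# trace_shapes(model.yolo3_learner, torch.randn(1, 384, 104, 104))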