Adjusting ResNet to output the desired dimensions

I am currently using the VGG16 feature layers (only the first 14 layers) as input to the rest of my network. My CNN works with a depth of 128, so I also added two convolutions (512 -> 256 and 256 -> 128) after the VGG16 feature layers to match that depth.

I’d also like to try out ResNet as the feature input to my CNN, but the problem is that ResNet (e.g. ResNet50) downsamples the image by a factor of 32, which leaves a feature map that is too small for the nature of my problem. I tried using ConvTranspose2d to upsample the output and increase its spatial size, and then decreasing the depth of the ResNet features. This is my current solution:

import torch.nn as nn
from torchvision import models
from torch import cat  

class Resnet50(nn.Module):
    """ResNet-50 backbone followed by a small upsampling decoder.

    The backbone is run up to ``layer4`` (2048 channels). Its output is then
    upsampled twice with stride-2 transposed convolutions; after each
    upsampling step the result is concatenated with the skip connection from
    ``layer3`` (1024 channels) and ``layer2`` (512 channels) respectively and
    fused by a 1x1 convolution. Two final 3x3 convolutions reduce the depth
    512 -> 256 -> 128.

    The returned feature map has 128 channels and the spatial size of the
    ``layer2`` output.
    """

    def __init__(self):
        super().__init__()

        # NOTE(review): recent torchvision releases prefer the ``weights=``
        # argument over ``pretrained=True`` -- confirm against the installed
        # torchvision version before switching.
        self.resnet = models.resnet50(pretrained=True)

        # Decoder stage 1: layer4 (2048 ch) -> 2x upsample -> fuse with layer3.
        self.upsample_1 = self._upsample_block(2048, 1024)
        self.combining_1 = self._conv_block(2048, 1024, 1)

        # Decoder stage 2: 1024 ch -> 2x upsample -> fuse with layer2.
        self.upsample_2 = self._upsample_block(1024, 512)
        self.combining_2 = self._conv_block(1024, 512, 1)

        # Depth reduction down to the 128 channels the output carries.
        self.decrease_depth_1 = self._conv_block(512, 256, 3, padding=1)
        self.decrease_depth_2 = self._conv_block(256, 128, 3, padding=1)

    @staticmethod
    def _upsample_block(in_channels, out_channels):
        """Build a 2x transposed-conv upsampling stage with BatchNorm + ReLU."""
        return nn.Sequential(
            nn.ConvTranspose2d(in_channels, out_channels, 4, stride=2, padding=1),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True))

    @staticmethod
    def _conv_block(in_channels, out_channels, kernel_size, padding=0):
        """Build a Conv2d + BatchNorm + ReLU stage."""
        return nn.Sequential(
            nn.Conv2d(in_channels, out_channels, kernel_size, padding=padding),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True))

    @staticmethod
    def _match_size(x, reference):
        """Return ``x`` resized to the spatial size (last two dims) of ``reference``.

        The transposed convolutions always produce exactly twice the input
        size, whereas a stride-2 stage of the backbone rounds up on odd sizes.
        For inputs whose height/width are not a multiple of 32 the upsampled
        map can therefore be one pixel larger than the skip connection, which
        would make the channel-wise concatenation fail. When the sizes already
        agree ``x`` is returned untouched.
        """
        target = tuple(reference.shape[-2:])
        if tuple(x.shape[-2:]) == target:
            return x
        return nn.functional.interpolate(
            x, size=target, mode="bilinear", align_corners=False)

    def forward(self, batch):
        """Compute the 128-channel feature map for ``batch``.

        Args:
            batch: input tensor fed to the backbone's first convolution
                (presumably ``(N, 3, H, W)`` images -- confirm with caller).

        Returns:
            Tensor with 128 channels and the spatial size of the backbone's
            ``layer2`` output.
        """
        rn = self.resnet

        # Backbone stem.
        x = rn.conv1(batch)
        x = rn.bn1(x)
        x = rn.relu(x)
        x = rn.maxpool(x)

        # Residual stages; keep layer2/layer3 outputs as skip connections.
        x = rn.layer1(x)
        out_l2 = rn.layer2(x)
        out_l3 = rn.layer3(out_l2)
        x = rn.layer4(out_l3)

        # Decoder stage 1: upsample and fuse with layer3.
        x = self._match_size(self.upsample_1(x), out_l3)
        x = self.combining_1(cat((x, out_l3), 1))

        # Decoder stage 2: upsample and fuse with layer2.
        x = self._match_size(self.upsample_2(x), out_l2)
        x = self.combining_2(cat((x, out_l2), 1))

        x = self.decrease_depth_1(x)
        x = self.decrease_depth_2(x)

        return x

However, I get weird outputs at the end of my CNN. I just want to know whether this is a somewhat sensible solution, or have I completely missed the mark?

1 Like