Differing behaviours between my two (seemingly identical) Sequential modules

I have been experimenting with the base VGG16 model to improve its performance, and I ended up changing the ‘features’ module so that it prints as follows:

(features): Sequential(
    (0): Conv2d(4, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (15): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (16): ReLU(inplace=True)
    (17): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (18): ReLU(inplace=True)
    (19): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (20): ReLU(inplace=True)
    (21): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (22): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (23): ReLU(inplace=True)
    (24): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (25): ReLU(inplace=True)
    (26): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (27): ReLU(inplace=True)
    (28): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )

To get the above I made the following edits in my VGG class:

import torch
import torch.nn as nn
from torchvision import models

class Vgg(nn.Module):
    
    def __init__(self, n_classes, bias=None, dropout=0.3):
        
        super().__init__()
        self.model = models.vgg16()

        # change the first layer to accept 4 input channels
        self.model.features[0] = nn.Conv2d(4, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
        # remove the second conv and its ReLU; positions shift after each del,
        # so deleting index 2 twice removes two consecutive layers
        del self.model.features[2]
        del self.model.features[2]
        # swap the first max pool for a larger-kernel one
        self.model.features[2] = nn.MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)

        # change the pooling layer
        self.model.avgpool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        
        # set output to correct num classes
        self.model.classifier = nn.Linear(in_features=512, out_features=n_classes, bias=True)

    def forward(self, x):
        x = self.model.features(x)
        x = self.model.avgpool(x)
        x = torch.squeeze(x)  # (N, 512, 1, 1) -> (N, 512); note this also drops the batch dim if N == 1
        x = self.model.classifier(x)

        return x
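
For reference, a quick smoke test like the following runs fine (the input size and n_classes are arbitrary here; a batch size of 2 avoids the squeeze() edge case):

model = Vgg(n_classes=10)
x = torch.randn(2, 4, 224, 224)  # dummy 4-channel batch
print(model(x).shape)  # expecting torch.Size([2, 10])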

When I run this code it works fine. I then tried to rewrite it in a more straightforward, easier-to-follow manner, like this:

class Vgg2(nn.Module):
    
    def __init__(self, n_classes, bias=None, dropout=0.3):
        
        super().__init__()
            
        self.dropout = nn.Dropout(p=dropout)
        self.r = nn.ReLU(inplace=True)
        
        modules = nn.Sequential(nn.Conv2d(4, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3)),
                                #self.bn64,
                                nn.ReLU(inplace=True),
                                nn.MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False),
                                
                                nn.Conv2d(64, 128, kernel_size=3, stride=1, padding=1, bias=True),
                                #self.bn128,
                                nn.ReLU(inplace=True),
                                nn.Conv2d(128, 128, kernel_size=3, stride=1, padding=1, bias=True),
                                #self.bn128,
                                nn.ReLU(inplace=True),
                                #self.dropout,
                                nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
                                
                                nn.Conv2d(128, 256, kernel_size=3, stride=1, padding=1, bias=True),
                                #self.bn256,
                                nn.ReLU(inplace=True),
                                nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=True),
                                #self.bn256,
                                nn.ReLU(inplace=True),
                                nn.Conv2d(256, 256, kernel_size=3, stride=1, padding=1, bias=True),
                                #self.bn256,
                                nn.ReLU(inplace=True),
                                #self.dropout,
                                nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
                                
                                nn.Conv2d(256, 512, kernel_size=3, stride=1, padding=1, bias=True),
                                #self.bn512,
                                nn.ReLU(inplace=True),
                                nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1, bias=True),
                                #self.bn512,
                                nn.ReLU(inplace=True),
                                nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1, bias=True),
                                #self.bn512,
                                nn.ReLU(inplace=True),
                                #self.dropout,
                                nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False),
                                
                                nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1, bias=True),
                                #self.bn512,
                                nn.ReLU(inplace=True),
                                nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1, bias=True),
                                #self.bn512,
                                nn.ReLU(inplace=True),
                                nn.Conv2d(512, 512, kernel_size=3, stride=1, padding=1, bias=True),
                                #self.bn512,
                                nn.ReLU(inplace=True),
                                #self.dropout,
                                nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
                                )
        
        self.get_feat = modules
        
        # change the pooling layer
        self.pool = nn.AdaptiveAvgPool2d(output_size=(1, 1))
        
        # set output to correct num classes
        self.classify = nn.Linear(in_features=512, out_features=n_classes, bias=True)

        
    def forward(self, x):
        
        x = self.get_feat(x)
        x = self.pool(x)
        x = torch.squeeze(x)  # (N, 512, 1, 1) -> (N, 512)
        x = self.classify(x)
        
        return x

However, when I try to train with this second version, the model does not learn at all: every prediction stays at zero.
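
One way to confirm nothing is updating is to inspect the gradients (a sketch, assuming a standard training loop where loss.backward() has just run):

for name, p in model.named_parameters():
    grad = p.grad.norm().item() if p.grad is not None else float('nan')
    print(f"{name}: grad norm = {grad:.3e}")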

The only difference I can find between the two is the index numbering inside the Sequential modules (model.features in Vgg, get_feat in Vgg2):

With Vgg (1st method):

(features): Sequential(
      (0): Conv2d(4, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
      (1): ReLU(inplace=True)
      (4): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
      (5): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (6): ReLU(inplace=True)
      (7): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (8): ReLU(inplace=True)
      (9): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (10): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (11): ReLU(inplace=True)
      (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (13): ReLU(inplace=True)
      (14): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (15): ReLU(inplace=True)
      (16): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (17): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (18): ReLU(inplace=True)
      (19): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (20): ReLU(inplace=True)
      (21): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (22): ReLU(inplace=True)
      (23): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
      (24): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (25): ReLU(inplace=True)
      (26): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (27): ReLU(inplace=True)
      (28): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
      (29): ReLU(inplace=True)
      (30): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    )

And with Vgg2 (2nd method):

(get_feat): Sequential(
    (0): Conv2d(4, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): ReLU(inplace=True)
    (5): Conv2d(128, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (6): ReLU(inplace=True)
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (13): ReLU(inplace=True)
    (14): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (15): Conv2d(256, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (16): ReLU(inplace=True)
    (17): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (18): ReLU(inplace=True)
    (19): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (20): ReLU(inplace=True)
    (21): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (22): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (23): ReLU(inplace=True)
    (24): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (25): ReLU(inplace=True)
    (26): Conv2d(512, 512, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (27): ReLU(inplace=True)
    (28): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )

In the first method the indices of ‘features’ go 0, 1, 4, 5, … (because of the layers I deleted from the original list), whereas the second method’s indices run in order (0, 1, 2, 3, 4, 5, …). I don’t see how this could make any difference, but it is genuinely the only discrepancy I can find.
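
To rule out any structural difference, the two stacks can be compared layer by layer, ignoring the index labels (a sketch; n_classes is arbitrary):

m1 = Vgg(n_classes=10)
m2 = Vgg2(n_classes=10)
layers1 = list(m1.model.features.children())
layers2 = list(m2.get_feat.children())
print(len(layers1) == len(layers2))
for l1, l2 in zip(layers1, layers2):
    if repr(l1) != repr(l2):
        print("differs:", l1, "vs", l2)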

Maybe someone else can see where I am going wrong?

It might be useful to check at which layer the output of the sequential module stops making sense. Can you try starting from a single layer in the sequential module, inspecting the output (e.g., just printing the output tensor), and adding more layers progressively?
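
Something along these lines (an untested sketch, assuming your Vgg2 class and a dummy 4-channel input):

model = Vgg2(n_classes=10)
x = torch.randn(2, 4, 224, 224)

with torch.no_grad():
    out = x
    for i, layer in enumerate(model.get_feat):
        out = layer(out)  # run through one layer at a time
        print(i, type(layer).__name__, tuple(out.shape),
              f"mean={out.mean().item():.4f}", f"std={out.std().item():.4f}")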