Problem with different ways of defining models

I defined the same image classification model in two different ways (CNN1 and CNN2), shown below. I believe the two models should be identical (I also checked that they have the same number of parameters by calling self.count_parameters()). While CNN1 trains normally, CNN2 fails to reduce the loss. (Note: apart from the difference between these two definitions, all other training code is identical.)
So, what am I missing in my definition of CNN2? Why does the program not raise an error even though the model cannot be trained?

class CNN1(nn.Module):
    def __init__(self):
        super(CNN1, self).__init__()
        
        self.network = nn.Sequential(
            # (batch, 3, 256, 256)
            nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1), # (batch, 32, 256, 256)
            nn.ReLU(),
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1, stride=2), # (batch, 64, 128, 128)
            nn.ReLU(),
            nn.MaxPool2d(2, 2), # (batch, 64, 64, 64)
            
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=3, padding=1), # (batch, 128, 64, 64)
            nn.ReLU(),
            nn.Conv2d(in_channels=128, out_channels=256, kernel_size=3, padding=1, stride=2), # (batch, 256, 32, 32)
            nn.ReLU(),
            nn.MaxPool2d(2, 2), # (batch, 256, 16, 16)
            
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1), # (batch, 256, 16, 16)
            nn.ReLU(),
            nn.Conv2d(in_channels=256, out_channels=256, kernel_size=3, padding=1), # (batch, 256, 16, 16)
            nn.ReLU(),
            nn.MaxPool2d(2, 2), # (batch, 256, 8, 8)      

            nn.Flatten(), # (batch, 256*8*8)
            nn.Linear(in_features=256*8*8, out_features=4096),
            nn.ReLU(),
            nn.Linear(in_features=4096, out_features=1024),
            nn.ReLU(),          
            nn.Linear(in_features=1024, out_features=512),
            nn.ReLU(),
            nn.Linear(in_features=512, out_features=2)
        )
    
    def forward(self, xb):
        return self.network(xb)

    def count_parameters(self):
        return sum(p.numel() for p in self.parameters() if p.requires_grad)

Number of parameters: 73402690
Output (the training loss decreases at each step; training is normal):
val_loss: 0.5657 val_roc: 0.7292
val_loss: 0.5432 val_roc: 0.9091
val_loss: 0.6879 val_roc: 0.7308
val_loss: 0.6001 val_roc: 0.7292
val_loss: 0.6052 val_roc: 0.6970
val_loss: 0.5604 val_roc: 0.7500
val_loss: 0.5020 val_roc: 0.7747
val_loss: 0.6042 val_roc: 0.5500

class CNN2(nn.Module):
    def __init__(self):
        super(CNN2, self).__init__()

        self.conv1 = self.conv_block(in_channels=3, out_channels=32, kernel_size=3, padding=1)
        self.conv2 = self.conv_block(in_channels=32, out_channels=64, kernel_size=3, padding=1, stride=2, pool=True)

        self.conv3 = self.conv_block(in_channels=64, out_channels=128, kernel_size=3, padding=1)
        self.conv4 = self.conv_block(in_channels=128, out_channels=256, kernel_size=3, padding=1, stride=2, pool=True)

        self.conv5 = self.conv_block(in_channels=256, out_channels=256, kernel_size=3, padding=1)
        self.conv6 = self.conv_block(in_channels=256, out_channels=256, kernel_size=3, padding=1, pool=True)

        self.ffn1 = self.FFN(in_features=256*8*8, out_features=4096, flatten=True)
        self.ffn2 = self.FFN(in_features=4096, out_features=1024)
        self.ffn3 = self.FFN(in_features=1024, out_features=512)
        self.ffn4 = self.FFN(in_features=512, out_features=2)


    def forward(self, xb):
        out = self.conv1(xb) # (batch, 32, 256, 256)
        out = self.conv2(out) # (batch, 64, 64, 64)
        out = self.conv3(out) # (batch, 128, 64, 64)
        out = self.conv4(out) # (batch, 256, 16, 16)
        out = self.conv5(out) # (batch, 256, 16, 16)
        out = self.conv6(out) # (batch, 256, 8, 8)
        out = self.ffn1(out) # (batch, 4096) 
        out = self.ffn2(out) # (batch, 1024)
        out = self.ffn3(out) # (batch, 512) 
        out = self.ffn4(out) # (batch, 2)
        return out

    def conv_block(self, in_channels, out_channels, kernel_size, padding, stride=1, pool=False):
        layers = [
            nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, padding=padding, stride=stride),
            nn.ReLU()
        ]
        if pool:
            layers.append(nn.MaxPool2d(2, 2))
        
        return nn.Sequential(*layers)


    def FFN(self, in_features, out_features, flatten=False):
        layers = [
            nn.Linear(in_features=in_features, out_features=out_features),
            nn.ReLU()
        ]

        if flatten:
            layers = [nn.Flatten()] + layers

        return nn.Sequential(*layers)

Number of parameters: 73402690
Output (the training loss does not decrease at any step):
val_loss: 0.6931 val_roc: 0.5000
val_loss: 0.6931 val_roc: 0.5000
val_loss: 0.6931 val_roc: 0.5000
val_loss: 0.6931 val_roc: 0.5000
val_loss: 0.6931 val_roc: 0.5000
val_loss: 0.6931 val_roc: 0.5000
val_loss: 0.6931 val_roc: 0.5000
val_loss: 0.6931 val_roc: 0.5000
val_loss: 0.6931 val_roc: 0.5000

self.FFN always appends a ReLU as the last layer. That is most likely the issue: the final block, self.ffn4, produces the model's logits, so it shouldn't end with a ReLU. With that ReLU in place, the two output values are clamped to be non-negative (and often both zero), which matches the constant val_loss of 0.6931 ≈ ln 2, i.e. a classifier that always predicts equal probabilities for the two classes.
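
One way to fix it is to make the ReLU optional in FFN and build the final block without it. A minimal sketch of that idea (the activation flag name here is just an example, not part of your original code):

    def FFN(self, in_features, out_features, flatten=False, activation=True):
        # Only append the ReLU when requested, so the final classifier
        # block can return raw logits instead of ReLU-clamped values.
        layers = [nn.Linear(in_features=in_features, out_features=out_features)]
        if activation:
            layers.append(nn.ReLU())

        if flatten:
            layers = [nn.Flatten()] + layers

        return nn.Sequential(*layers)

    # in __init__, build the last block without the activation:
    self.ffn4 = self.FFN(in_features=512, out_features=2, activation=False)

Equivalently, you could just define self.ffn4 = nn.Linear(512, 2) directly, which then matches the final nn.Linear(in_features=512, out_features=2) in CNN1.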

Thank you for your answer, this solves my problem.