Same Network gives different losses

I have two networks that should be identical. One is built from individual nn.X modules and the other uses nn.Sequential. However, when I train them on the same image, network A seems to converge but network B doesn't.

Network A

import torch
from torch import nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5,stride=1)
        self.pool  = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5,stride=1)
        self.fc1   = nn.Linear(16 * 4 * 4, 120)
        self.fc2   = nn.Linear(120, 84)
        self.fc3   = nn.Linear(84, 3)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 4 * 4)  # flatten; the 16 * 4 * 4 assumes 28x28 inputs
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
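For reference, the 16 * 4 * 4 flatten implies 28x28 inputs: 28 → conv(5) → 24 → pool → 12 → conv(5) → 8 → pool → 4. A quick sanity check (assuming that input size):

import torch

x = torch.randn(1, 3, 28, 28)  # assumed input size; it is what makes 16 * 4 * 4 work
print(Net()(x).shape)          # torch.Size([1, 3])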

Network B

import torch
from torch import nn

import functools
import operator

class CNN(nn.Module):
    """Basic Pytorch CNN implementation"""

    def __init__(self, in_channels, out_channels, input_dim):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=6, kernel_size=5, stride=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
        )

        # NOTE: this product runs over the full output shape, including the
        # batch dimension; it only works because the dummy batch size is 1.
        num_features_before_fcnn = functools.reduce(
            operator.mul, list(self.features(torch.rand(1, *input_dim)).shape)
        )

        self.classifier = nn.Sequential(
            nn.Linear(in_features=num_features_before_fcnn, out_features=120),
            nn.Linear(in_features=120, out_features=84),
            nn.Linear(in_features=84, out_features=out_channels),
        )

    def forward(self, x):
        out = self.features(x)
        out = out.view(out.size(0), -1)  # flatten all dims except the batch dimension
        out = self.classifier(out)
        return out
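One aside on the shape inference: the reduce in __init__ multiplies over the full output shape, batch dimension included, which only works because the dummy batch is 1. A minimal sketch of a variant that counts features after flattening (assuming 3x28x28 inputs):

import torch
from torch import nn

features = nn.Sequential(
    nn.Conv2d(3, 6, kernel_size=5), nn.ReLU(), nn.MaxPool2d(2),
    nn.Conv2d(6, 16, kernel_size=5), nn.ReLU(), nn.MaxPool2d(2),
)
dummy = features(torch.rand(1, 3, 28, 28))
print(dummy.shape)                # torch.Size([1, 16, 4, 4])
print(dummy.view(1, -1).size(1))  # 256 -- feature count without the batch dim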

Training loop

import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

def train(criterion, model, dl, optimizer, epochs, device, log_freq):
    loss_lst = []
    running_loss = 0.0
    for epoch in range(epochs):
        for batch_idx, (data, target) in enumerate(dl):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()

            output = model(data)
            loss = criterion(output, target)  # already on `device`; no .to() needed
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            # log the average loss over the last log_freq batches
            if batch_idx % log_freq == log_freq - 1:
                print(f"Epoch {epoch} batch index {batch_idx} loss {running_loss / log_freq}")
                loss_lst.append(running_loss / log_freq)
                running_loss = 0.0
    return loss_lst
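For reference, a minimal sketch of how this loop could be driven end to end (the random dataset and the 3x28x28 input size are assumptions for illustration, not the real data):

import torch
from torch import optim
from torch.utils.data import DataLoader, TensorDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# dummy 3-class dataset of 3x28x28 images (assumed shape; see the 16 * 4 * 4 flatten)
images = torch.randn(64, 3, 28, 28)
labels = torch.randint(0, 3, (64,))
dl = DataLoader(TensorDataset(images, labels), batch_size=8, shuffle=True)

net = Net().to(device)
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
losses = train(criterion, net, dl, optimizer, epochs=5, device=device, log_freq=1)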
I kept everything else (optimizer, number of epochs, criterion, etc.) the same for training in both cases.
Am I missing something?

Are you sure they are the same networks? In network B, the linear layers are not followed by any activation, unlike in network A. This essentially means that your classifier collapses into one big linear layer.
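To see this concretely, here is a minimal sketch: two nn.Linear layers with nothing between them compute exactly one affine map, so stacking them adds no expressive power.

import torch
from torch import nn

# composing y = W2(W1 x + b1) + b2 gives one affine map with
# W = W2 @ W1 and b = W2 @ b1 + b2
f = nn.Sequential(nn.Linear(4, 3), nn.Linear(3, 2))
W = f[1].weight @ f[0].weight
b = f[1].weight @ f[0].bias + f[1].bias

x = torch.randn(5, 4)
print(torch.allclose(f(x), x @ W.t() + b, atol=1e-6))  # True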


You're right, I missed that. So essentially, the replica of A is:

import torch
from torch import nn

import functools
import operator

class CNN(nn.Module):
    """Basic Pytorch CNN implementation"""

    def __init__(self, in_channels, out_channels, input_dim):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=6, kernel_size=5, stride=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
        )

        # NOTE: this product runs over the full output shape, including the
        # batch dimension; it only works because the dummy batch size is 1.
        num_features_before_fcnn = functools.reduce(
            operator.mul, list(self.features(torch.rand(1, *input_dim)).shape)
        )

        self.classifier = nn.Sequential(
            nn.Linear(in_features=num_features_before_fcnn, out_features=120),
            nn.ReLU(inplace=True),
            nn.Linear(in_features=120, out_features=84),
            nn.ReLU(inplace=True),
            nn.Linear(in_features=84, out_features=out_channels),
        )

    def forward(self, x):
        out = self.features(x)
        out = out.view(out.size(0), -1)  # flatten all dims except the batch dimension
        out = self.classifier(out)
        return out
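One way to sanity-check the fix (a minimal sketch, assuming 3-channel 28x28 inputs, which is what the 16 * 4 * 4 flatten implies): copy A's weights into B and confirm the two produce identical outputs.

import torch

net_a = Net()
net_b = CNN(in_channels=3, out_channels=3, input_dim=(3, 28, 28))

# matching layers between the two architectures
pairs = [
    (net_a.conv1, net_b.features[0]), (net_a.conv2, net_b.features[3]),
    (net_a.fc1, net_b.classifier[0]), (net_a.fc2, net_b.classifier[2]),
    (net_a.fc3, net_b.classifier[4]),
]
with torch.no_grad():
    for src, dst in pairs:
        dst.weight.copy_(src.weight)
        dst.bias.copy_(src.bias)

x = torch.randn(2, 3, 28, 28)
print(torch.allclose(net_a(x), net_b(x)))  # True once the architectures match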