Same Network gives different losses

I have two networks that should be identical. One is built from individual nn.X modules and the other uses nn.Sequential. However, when I train them on the same image, network A seems to converge but network B doesn't.

Network A

import torch
from torch import nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5,stride=1)
        self.pool  = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5,stride=1)
        self.fc1   = nn.Linear(16 * 4 * 4, 120)
        self.fc2   = nn.Linear(120, 84)
        self.fc3   = nn.Linear(84, 3)

    def forward(self, x):
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = x.view(-1, 16 * 4 * 4)  # flatten; the 16 * 4 * 4 assumes 28x28 inputs
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = self.fc3(x)
        return x
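For reference, the 16 * 4 * 4 flatten implies 28x28 inputs: 28 → conv(5) → 24 → pool → 12 → conv(5) → 8 → pool → 4. A quick sanity check (assuming that input size):

import torch

x = torch.randn(1, 3, 28, 28)  # assumed input size; it is what makes 16 * 4 * 4 work
print(Net()(x).shape)          # torch.Size([1, 3])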

Network B

import torch
from torch import nn

import functools
import operator

class CNN(nn.Module):
    """Basic Pytorch CNN implementation"""

    def __init__(self, in_channels, out_channels, input_dim):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=6, kernel_size=5, stride=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
        )

        # NOTE: this product runs over the full output shape, including the
        # batch dimension; it only works because the dummy batch size is 1.
        num_features_before_fcnn = functools.reduce(
            operator.mul, list(self.features(torch.rand(1, *input_dim)).shape)
        )

        self.classifier = nn.Sequential(
            nn.Linear(in_features=num_features_before_fcnn, out_features=120),
            nn.Linear(in_features=120, out_features=84),
            nn.Linear(in_features=84, out_features=out_channels),
        )

    def forward(self, x):
        out = self.features(x)
        out = out.view(out.size(0), -1)  # flatten all dims except the batch dimension
        out = self.classifier(out)
        return out
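One aside on the shape inference: the reduce in __init__ multiplies over the full output shape, batch dimension included, which only works because the dummy batch is 1. A minimal sketch of a variant that counts features after flattening (assuming 3x28x28 inputs):

import torch
from torch import nn

features = nn.Sequential(
    nn.Conv2d(3, 6, kernel_size=5), nn.ReLU(), nn.MaxPool2d(2),
    nn.Conv2d(6, 16, kernel_size=5), nn.ReLU(), nn.MaxPool2d(2),
)
dummy = features(torch.rand(1, 3, 28, 28))
print(dummy.shape)                # torch.Size([1, 16, 4, 4])
print(dummy.view(1, -1).size(1))  # 256 -- feature count without the batch dim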

Training loop

import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

def train(criterion, model, dl, optimizer, epochs, device, log_freq):
    loss_lst = []
    running_loss = 0.0
    for epoch in range(epochs):
        for batch_idx, (data, target) in enumerate(dl):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()

            output = model(data)
            loss = criterion(output, target)  # already on `device`; no .to() needed
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            # log the average loss over the last log_freq batches
            if batch_idx % log_freq == log_freq - 1:
                print(f"Epoch {epoch} batch index {batch_idx} loss {running_loss / log_freq}")
                loss_lst.append(running_loss / log_freq)
                running_loss = 0.0
    return loss_lst
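For reference, a minimal sketch of how this loop could be driven end to end (the random dataset and the 3x28x28 input size are assumptions for illustration, not the real data):

import torch
from torch import optim
from torch.utils.data import DataLoader, TensorDataset

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# dummy 3-class dataset of 3x28x28 images (assumed shape; see the 16 * 4 * 4 flatten)
images = torch.randn(64, 3, 28, 28)
labels = torch.randint(0, 3, (64,))
dl = DataLoader(TensorDataset(images, labels), batch_size=8, shuffle=True)

net = Net().to(device)
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
losses = train(criterion, net, dl, optimizer, epochs=5, device=device, log_freq=1)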
I kept everything else (optimizer, number of epochs, criterion, etc.) the same for training in both cases.
Am I missing something?

Are you sure they are the same networks? In network B, the linear layers are not followed by any activation, unlike in network A. This essentially means that your classifier collapses into one big linear layer.
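To see this concretely, here is a minimal sketch: two nn.Linear layers with nothing between them compute exactly one affine map, so stacking them adds no expressive power.

import torch
from torch import nn

# composing y = W2(W1 x + b1) + b2 gives one affine map with
# W = W2 @ W1 and b = W2 @ b1 + b2
f = nn.Sequential(nn.Linear(4, 3), nn.Linear(3, 2))
W = f[1].weight @ f[0].weight
b = f[1].weight @ f[0].bias + f[1].bias

x = torch.randn(5, 4)
print(torch.allclose(f(x), x @ W.t() + b, atol=1e-6))  # True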


You're right, I missed that. So essentially, the replica of A is:

import torch
from torch import nn

import functools
import operator

class CNN(nn.Module):
    """Basic Pytorch CNN implementation"""

    def __init__(self, in_channels, out_channels, input_dim):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=6, kernel_size=5, stride=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
        )

        # NOTE: this product runs over the full output shape, including the
        # batch dimension; it only works because the dummy batch size is 1.
        num_features_before_fcnn = functools.reduce(
            operator.mul, list(self.features(torch.rand(1, *input_dim)).shape)
        )

        self.classifier = nn.Sequential(
            nn.Linear(in_features=num_features_before_fcnn, out_features=120),
            nn.ReLU(inplace=True),
            nn.Linear(in_features=120, out_features=84),
            nn.ReLU(inplace=True),
            nn.Linear(in_features=84, out_features=out_channels),
        )

    def forward(self, x):
        out = self.features(x)
        out = out.view(out.size(0), -1)  # flatten all dims except the batch dimension
        out = self.classifier(out)
        return out
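One way to sanity-check the fix (a minimal sketch, assuming 3-channel 28x28 inputs, which is what the 16 * 4 * 4 flatten implies): copy A's weights into B and confirm the two produce identical outputs.

import torch

net_a = Net()
net_b = CNN(in_channels=3, out_channels=3, input_dim=(3, 28, 28))

# matching layers between the two architectures
pairs = [
    (net_a.conv1, net_b.features[0]), (net_a.conv2, net_b.features[3]),
    (net_a.fc1, net_b.classifier[0]), (net_a.fc2, net_b.classifier[2]),
    (net_a.fc3, net_b.classifier[4]),
]
with torch.no_grad():
    for src, dst in pairs:
        dst.weight.copy_(src.weight)
        dst.bias.copy_(src.bias)

x = torch.randn(2, 3, 28, 28)
print(torch.allclose(net_a(x), net_b(x)))  # True once the architectures match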