# Same Network gives different losses

I have two networks that are supposed to be the same. One network uses `nn.Sequential` and the other uses a combination of individual `nn.X` layers. However, when I train them on the same image, network `A` seems to converge but network `B` doesn’t.

Network A

``````class Net(nn.Module):
def __init__(self):
    """Create the convolution, pooling and fully connected layers."""
    super(Net, self).__init__()

    # Two 5x5 convolutions; a single 2x2 max-pool module is stored once
    # and reused after each convolution in ``forward``.
    self.conv1 = nn.Conv2d(3, 6, kernel_size=5, stride=1)
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
    self.conv2 = nn.Conv2d(6, 16, kernel_size=5, stride=1)

    # Three linear layers; the first one takes the 16 * 4 * 4 values
    # that ``forward`` produces when it flattens the feature maps.
    flat_features = 16 * 4 * 4
    self.fc1 = nn.Linear(flat_features, 120)
    self.fc2 = nn.Linear(120, 84)
    self.fc3 = nn.Linear(84, 3)

def forward(self, x):
    """Apply both conv/pool stages, flatten, then the three linear layers."""
    # Convolution -> ReLU -> max-pool, once per convolution.
    for conv in (self.conv1, self.conv2):
        x = self.pool(F.relu(conv(x)))

    # Reshape to rows of 16 * 4 * 4 values for the first linear layer.
    x = x.view(-1, 16 * 4 * 4)

    # ReLU follows the first two linear layers; the last output is returned raw.
    for fc in (self.fc1, self.fc2):
        x = F.relu(fc(x))
    return self.fc3(x)
``````

Network B

``````import torch
from torch import nn

import functools
import operator

class CNN(nn.Module):
    """Basic PyTorch CNN: two conv/ReLU/max-pool stages and three linear layers.

    Args:
        in_channels: Number of channels expected by the first convolution.
        out_channels: Output size of the last linear layer.
        input_dim: Shape of one input sample without the batch dimension;
            it is unpacked into ``torch.rand(1, *input_dim)`` to size the
            first linear layer.
    """

    def __init__(self, in_channels, out_channels, input_dim):
        nn.Module.__init__(self)
        self.features = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=6, kernel_size=5, stride=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
        )

        # Push one random sample through the feature extractor to learn how
        # many values it emits; no gradients are needed for this probe.
        with torch.no_grad():
            num_features_before_fcnn = self.features(torch.rand(1, *input_dim)).numel()

        # Three linear layers stacked directly, with no non-linearity between them.
        self.classifier = nn.Sequential(
            nn.Linear(in_features=num_features_before_fcnn, out_features=120),
            nn.Linear(in_features=120, out_features=84),
            nn.Linear(in_features=84, out_features=out_channels),
        )

    def forward(self, x):
        """Return the classifier output for a batch ``x``."""
        out = self.features(x)
        # Flatten per sample instead of hard-coding 16 * 4 * 4, so the
        # flattened width always matches what ``__init__`` measured.
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out
``````

Training loop

``````criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)

def train(criterion, model, dl, optimizer, epoch, device, log_freq):
    """Train ``model`` on the batches of ``dl`` for ``epoch`` passes.

    Args:
        criterion: Callable taking ``(output, target)`` and returning the loss.
        model: Callable applied to each batch of data.
        dl: Iterable yielding ``(data, target)`` pairs.
        optimizer: Optimizer whose ``zero_grad()`` and ``step()`` are called
            once per batch.
        epoch: Number of passes over ``dl``.
        device: Device each batch is moved to with ``.to(device)``.
        log_freq: The loss is printed and recorded on epochs whose index is
            a multiple of this value.

    Returns:
        List of the running-loss values recorded at each logging point.
    """
    # NOTE(review): the original snippet lost its indentation; the logging
    # block is assumed to sit inside the batch loop - confirm.
    lost_lst = []
    running_loss = 0.0
    for i in range(epoch):
        for batch_idx, (data, target) in enumerate(dl):
            data, target = data.to(device), target.to(device)

            # Gradients accumulate across backward() calls, so clear them
            # before computing the gradients of this batch.
            optimizer.zero_grad()
            output = model(data)
            loss = criterion(output, target)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            if i % log_freq == 0:
                print(f"Epoch {i} batch index {batch_idx} loss {running_loss/log_freq}")
                lost_lst.append(running_loss)
                running_loss = 0.0
    return lost_lst
``````

I kept everything else (optimizer, number of epochs, criterion, etc.) the same when training both networks.
Am I missing something?

Are you sure they are the same networks? In network B, the linear layers are not followed by any activation, unlike in network A. Without a non-linearity between them, the stacked linear layers collapse into one big linear layer, and that single linear layer is your whole classifier.

You’re right, I missed that. So essentially, the replica of A is:

``````import torch
from torch import nn

import functools
import operator

class CNN(nn.Module):
    """Basic PyTorch CNN: two conv/ReLU/max-pool stages and a ReLU-activated MLP head.

    Args:
        in_channels: Number of channels expected by the first convolution.
        out_channels: Output size of the last linear layer.
        input_dim: Shape of one input sample without the batch dimension;
            it is unpacked into ``torch.rand(1, *input_dim)`` to size the
            first linear layer.
    """

    def __init__(self, in_channels, out_channels, input_dim):
        nn.Module.__init__(self)
        self.features = nn.Sequential(
            nn.Conv2d(in_channels=in_channels, out_channels=6, kernel_size=5, stride=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),

            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=2),
        )

        # Push one random sample through the feature extractor to learn how
        # many values it emits; no gradients are needed for this probe.
        with torch.no_grad():
            num_features_before_fcnn = self.features(torch.rand(1, *input_dim)).numel()

        # A ReLU follows each of the first two linear layers; the last
        # linear layer's output is returned without an activation.
        self.classifier = nn.Sequential(
            nn.Linear(in_features=num_features_before_fcnn, out_features=120),
            nn.ReLU(inplace=True),
            nn.Linear(in_features=120, out_features=84),
            nn.ReLU(inplace=True),
            nn.Linear(in_features=84, out_features=out_channels),
        )

    def forward(self, x):
        """Return the classifier output for a batch ``x``."""
        out = self.features(x)
        # Flatten per sample instead of hard-coding 16 * 4 * 4, so the
        # flattened width always matches what ``__init__`` measured.
        out = out.view(out.size(0), -1)
        out = self.classifier(out)
        return out
``````