I have two networks which are the same. One network uses nn.Sequential
and the other uses a combination of different nn.X
. However, when I train them on the same image, network A
seem to converge but network B
doesn’t
Network A
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.conv1 = nn.Conv2d(in_channels=3, out_channels=6, kernel_size=5,stride=1)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5,stride=1)
self.fc1 = nn.Linear(16 * 4 * 4, 120)
self.fc2 = nn.Linear(120, 84)
self.fc3 = nn.Linear(84, 3)
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = x.view(-1, 16 * 4 * 4)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = self.fc3(x)
return x
Network B
import torch
from torch import nn
import functools
import operator
class CNN(nn.Module):
"""Basic Pytorch CNN implementation"""
def __init__(self, in_channels, out_channels, input_dim):
nn.Module.__init__(self)
self.features = nn.Sequential(
nn.Conv2d(in_channels=in_channels, out_channels=6, kernel_size=5, stride=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2),
nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2),
)
num_features_before_fcnn = functools.reduce(operator.mul, list(self.features(torch.rand(1, *input_dim)).shape))
self.classifier = nn.Sequential(
nn.Linear(in_features=num_features_before_fcnn, out_features=120),
nn.Linear(in_features=120, out_features=84),
nn.Linear(in_features=84, out_features=out_channels),
)
def forward(self, x):
out = self.features(x)
out = out.view(-1, 16 * 4 * 4) # flatten the vector
out = self.classifier(out)
return out
Training loop
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
def train(criterion,model,dl,optimizer,epoch,device,log_freq):
lost_lst = []
running_loss = 0.0
for i in range(epoch):
for batch_idx,(data,target) in enumerate(dl):
data,target = data.to(device),target.to(device)
optimizer.zero_grad()
output = model(data)
loss = criterion(output,target).to(device)
loss.backward()
optimizer.step()
running_loss += loss.item()
if i % log_freq == 0:
print(f"Epoch {i} batch index {batch_idx} loss {running_loss/log_freq}")
lost_lst.append(running_loss)
running_loss = 0.0
return lost_lst
``
I maintained all other things(optimizer,epoch,criterion etc) for training in both cases
Am I missing something?