Why is my network's loss constant and the network not training?

I'm trying to train a simple model on CIFAR10. I'm using the PyTorch tutorial code, just with more layers added. Can someone please help me and tell me why my loss is constant? Am I doing something wrong?

import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torchvision.transforms as transforms

# loading CIFAR10
transform = transforms.Compose(
        [transforms.ToTensor(),
         transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))])

trainset = torchvision.datasets.CIFAR10(root='./data', train=True,
                                        download=True, transform=transform)

trainloader = torch.utils.data.DataLoader(trainset, batch_size=4,
                                          shuffle=True, num_workers=2)

testset = torchvision.datasets.CIFAR10(root='./data', train=False,
                                       download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=1,
                                         shuffle=False, num_workers=2)

classes = ('plane', 'car', 'bird', 'cat',
           'deer', 'dog', 'frog', 'horse', 'ship', 'truck')


# defining Network:


class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        
        self.conv11 = nn.Conv2d(3, 64, 3, padding=1)
        self.pool1 = nn.AvgPool2d(2, 2)

        self.conv21 = nn.Conv2d(64, 64*2, 3, padding=1)
        self.pool2 = nn.AvgPool2d(2, 2)

        self.conv31 = nn.Conv2d(64*2, 64*4, 3, padding=1)
        self.pool3 = nn.AvgPool2d(2, 2)

        self.conv41 = nn.Conv2d(64*4, 64*8, 3, padding=1)
        self.pool4 = nn.AvgPool2d(2, 2)

        self.conv51 = nn.Conv2d(64*8, 64*16, 1)   # 1x1 convs act as the classifier head
        self.conv52 = nn.Conv2d(64*16, 10, 1)
        self.pool5 = nn.AvgPool2d(2, 2)
        
    def forward(self, x):
        # input: [batch, 3, 32, 32]
        x = F.relu(self.conv11(x))
        x = self.pool1(x)           # [batch, 64, 16, 16]

        x = F.relu(self.conv21(x))
        x = self.pool2(x)           # [batch, 128, 8, 8]

        x = F.relu(self.conv31(x))
        x = self.pool3(x)           # [batch, 256, 4, 4]

        x = F.relu(self.conv41(x))
        x = self.pool4(x)           # [batch, 512, 2, 2]

        x = F.relu(self.conv51(x))
        x = F.relu(self.conv52(x))  # [batch, 10, 2, 2]

        x = self.pool5(x)           # [batch, 10, 1, 1]

        x = x.view(-1, 10)          # [batch, 10] logits

        return x


net = Net()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
net.to(device)

# =============================================================================
# 3. Define a Loss function and optimizer
# =============================================================================
import torch.optim as optim
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(net.parameters(), lr=0.01, momentum=0.9)
# =============================================================================
# 4. Train the network
# =============================================================================
  
for epoch in range(30):  # loop over the dataset multiple times
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):

        # get the inputs
        inputs, labels = data     
        inputs, labels = inputs.to(device), labels.to(device)

        optimizer.zero_grad()
        # forward + backward + optimize
        outputs = net(inputs)      
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0
            
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            images, labels = images.to(device), labels.to(device)
            outputs = net(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    
    print('Accuracy of the network on the 10000 test images: %d %%' % (
        100 * correct / total))
    

The output is so weird:

Files already downloaded and verified
Files already downloaded and verified
cuda:0

[1,  2000] loss: 2.303
[1,  4000] loss: 2.303
[1,  6000] loss: 2.303
[1,  8000] loss: 2.303
[1, 10000] loss: 2.303

I think the problem is the last ReLU activation.
It prevents the output of the net from ever being negative, which in turn prevents the softmax from working properly.
(By softmax I mean the one applied inside the loss, i.e. in CrossEntropyLoss, if you don't consider the tricks added for numerical stability.)
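
Your constant loss of 2.303 is the telltale number: it equals ln(10), the cross-entropy of a uniform prediction over 10 classes, which is what you get when the logits are all equal (for example, all clamped toward zero). A minimal standalone sketch (not part of your training code) showing this:

import math
import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
logits = torch.zeros(4, 10)              # all-equal logits -> uniform softmax
labels = torch.randint(0, 10, (4,))      # the labels don't matter in this case
print(criterion(logits, labels).item())  # ~2.3026, matching your plateau
print(math.log(10))                      # ln(10) = 2.302585...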

Try removing it, as in the following snippet:

def forward(self, x):
        
        x = F.relu(self.conv11(x))
        x = self.pool1(x)

        x = F.relu(self.conv21(x))
        x = self.pool2(x)
        
        x = F.relu(self.conv31(x))
        x = self.pool3(x)

        x = F.relu(self.conv41(x))
        x = self.pool4(x)
        
        x = F.relu(self.conv51(x))
        x = self.conv52(x) #x = F.relu(self.conv52(x))
        
        x = self.pool5(x)
        
        x = x.view(-1, 10)
        
        return x
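
As a quick sanity check (a sketch, assuming Net now uses this corrected forward), the output should be [batch, 10], and without the final ReLU the logits are free to go negative, so the softmax inside the loss is no longer clamped:

net = Net()
dummy = torch.randn(4, 3, 32, 32)   # a fake CIFAR10-sized batch
out = net(dummy)
print(out.shape)                    # torch.Size([4, 10])
print(out.min().item())             # can now be negative: no final ReLU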