Training loss is constant from the beginning in CNN

```python
import os

import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable


class Orfunction(torch.nn.Module):

    def __init__(self, D_in, H, num_classes):
        super().__init__()
        self.D_in = D_in
        self.H = H
        self.D_out = num_classes
        # AlexNet-style feature extractor
        self.conv1 = nn.Conv2d(D_in, H, kernel_size=11, stride=4, padding=2)
        self.mpool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.conv2 = nn.Conv2d(64, 192, kernel_size=5, padding=2)
        self.mpool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.conv3 = nn.Conv2d(192, 384, kernel_size=3, padding=1)
        self.conv4 = nn.Conv2d(384, 256, kernel_size=3, padding=1)
        self.conv5 = nn.Conv2d(256, 256, kernel_size=3, padding=1)
        self.mpool3 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
        # Classifier head
        self.linear1 = nn.Linear(256 * 6 * 6, 4096)
        self.linear2 = nn.Linear(4096, 4096)
        self.linear3 = nn.Linear(4096, num_classes)

    def forward(self, x):
        x = self.conv1(x)
        x = F.leaky_relu(x)
        x = self.mpool1(x)
        x = self.conv2(x)
        x = F.leaky_relu(x)
        x = self.mpool2(x)
        x = self.conv3(x)
        x = F.leaky_relu(x)
        x = self.conv4(x)
        x = F.leaky_relu(x)
        x = self.conv5(x)
        x = F.leaky_relu(x)
        x = self.mpool3(x)
        x = self.avgpool(x)
        x = x.view(x.size(0), 256 * 6 * 6)
        x = self.linear1(x)
        x = F.leaky_relu(x)
        x = self.linear2(x)
        x = F.leaky_relu(x)
        x = self.linear3(x)
        x = torch.softmax(x, 1)
        return x

D_in = 3
H = 64
num_classes = 3

classes = ['truck', 'auto', 'car']
function_network = Orfunction(D_in, H, num_classes)

Device = torch.device('cpu')
function_network.to(Device)

loss_function = torch.nn.CrossEntropyLoss()

vls = list(function_network.parameters())

optimizer = optim.Adam(vls, lr=0.01, weight_decay=0.001)

# Collect the image paths from the training folder
list1 = list()
path2 = '/home/prashant/racing_car/google-images-download/downloads/train'
images = os.listdir(path2)
for image in images:
    img = os.path.join(path2, image)
    list1.append(img)

# Load and resize the images
list2 = list()
for imgs in list1:
    img = cv2.imread(imgs)
    img1 = cv2.resize(img, (256, 256))
    list2.append(img1)

# Convert HWC arrays to float CHW tensors with a batch dimension
list3 = list()
for j in list2:
    img1 = torch.tensor(j, dtype=torch.float, requires_grad=True)
    img1 = img1.permute(2, 0, 1)
    img1 = img1.unsqueeze(dim=0)
    list3.append(img1)

batch1 = torch.stack(list3, dim=1)
batch1 = batch1.squeeze(dim=0)

y = torch.tensor([0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2])
print(y.shape)

epochs = 20

for i in range(epochs):

    print('iteration', i)

    optimizer.zero_grad()
    output = function_network(batch1)
    print("o", output.shape)

    loss = loss_function(output, y)

    loss = Variable(loss, requires_grad=True)

    print("loss", loss.item())

    print(list(function_network.parameters())[0].grad)
    init1 = list(function_network.parameters())[0].clone()

    loss.backward()

    optimizer.step()

    init2 = list(function_network.parameters())[0].clone()

    t = torch.equal(init1.data, init2.data)

    print('are the same :', t)
```

The output is:

```
loss 1.099882960319519
are they same : True
loss 1.099882960319519
are they same : True
loss 1.099882960319519
are they same : True
loss 1.099882960319519
are they same : True
loss 1.099882960319519
are they same : True
loss 1.099882960319519
are they same : True
loss 1.099882960319519
are they same : True
loss 1.099882960319519
are they same : True
```

I think this operation is breaking your computation graph: `loss = Variable(loss, requires_grad=True)`. Re-wrapping the loss creates a new tensor with no history, so `backward()` never propagates gradients to your parameters (which is why they stay identical every iteration).

The order of operations should be:

```python
loss = loss_function(output, y)
loss.backward()
optimizer.step()
```
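
For reference, a minimal sketch of the full corrected loop, reusing the variable names from the code above (nothing new is introduced, only the `Variable` re-wrapping is dropped):

```python
for i in range(epochs):
    optimizer.zero_grad()              # clear gradients from the previous step
    output = function_network(batch1)  # forward pass builds the computation graph
    loss = loss_function(output, y)    # use this tensor directly; do not re-wrap it
    loss.backward()                    # gradients now flow back to the parameters
    optimizer.step()                   # parameters actually change between iterations
    print('iteration', i, 'loss', loss.item())
```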

In addition to this, you should remove the softmax at the end of your model, since nn.CrossEntropyLoss expects logits and internally applies log_softmax to them.
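
As a small sketch of what that looks like with the model above (the only change is returning the raw `linear3` output; softmax is only needed when you explicitly want probabilities, e.g. at inference time):

```python
# In Orfunction.forward(), end with the raw logits:
#     x = self.linear3(x)
#     return x              # no torch.softmax here

loss_function = torch.nn.CrossEntropyLoss()  # applies log_softmax + NLLLoss internally
logits = function_network(batch1)
loss = loss_function(logits, y)

# If probabilities are needed for inspection or inference, compute them outside the loss:
probs = torch.softmax(logits, dim=1)
```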


Thank you so much, guys, for the quick reply; it is working now.

Note: Can you provide any links for understanding:
1. How to decide which optimizer to use (Adam vs. SGD)?
2. The computation graph.
3. Which operations break the computation graph?
4. How can I visualise these graphs while building a CNN in PyTorch?

Using Adam:

```
loss 1.1901756525039673
iteration 1
loss 400.5993347167969
iteration 2
loss 78.54071807861328
iteration 3
loss 191.35740661621094
iteration 4
loss 153.02627563476562
iteration 5
loss 29.623409271240234
iteration 6
loss 7.654780864715576
iteration 7
loss 4.365676403045654
iteration 8
loss 24.920808792114258
iteration 9
loss 3.465801954269409
iteration 10
loss 1.6431617736816406
```

Using SGD:

```
iteration 0
loss 1.0905007123947144
iteration 1
loss 1.0449053049087524
iteration 2
loss 1.0095171928405762
iteration 3
loss 0.9810587167739868
iteration 4
loss 0.95912104845047
iteration 5
loss 0.950375497341156
iteration 6
loss 0.9360883235931396
iteration 7
loss 0.9562820196151733
```

Can you explain why the loss increases so much at the beginning with Adam?

For 2 and 3, I think this might be a good starting point, the autograd mechanics notes: https://pytorch.org/docs/stable/notes/autograd.html
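
As a tiny illustrative sketch of points 2 and 3 (hypothetical tensors, not from your code): wrapping a result into a brand-new tensor discards its history, while using the original tensor keeps the graph intact.

```python
import torch

w = torch.randn(3, requires_grad=True)
loss = (w ** 2).sum()            # tracked: loss.grad_fn is set

# Creating a new tensor from the value detaches it from the graph
detached = torch.tensor(loss.item(), requires_grad=True)
print(detached.grad_fn)          # None -> no connection back to w

detached.backward()
print(w.grad)                    # None: gradients never reached w

loss.backward()
print(w.grad)                    # 2 * w: gradients flow through the original graph
```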

For 1, I do not have a good answer. Normally Adam is more forgiving of hyperparameter choices than SGD, so it is useful when you want to iterate quickly on a problem (to see whether it works at all).
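
In code the choice is just a one-line swap; a rough sketch with commonly used (untuned) hyperparameters:

```python
import torch.optim as optim

# Adam: adaptive per-parameter step sizes, usually reasonable near its defaults
optimizer = optim.Adam(function_network.parameters(), lr=1e-3, weight_decay=1e-3)

# SGD: often needs more tuning of lr/momentum, a common choice once the model trains
# optimizer = optim.SGD(function_network.parameters(), lr=0.01, momentum=0.9)
```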

For 4, I think you can use TensorBoard (https://pytorch.org/docs/stable/tensorboard.html?highlight=tensorboard) or this package: https://github.com/szagoruyko/pytorchviz
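
A rough sketch of both options, assuming the `function_network` and `batch1` from the post and that the packages (plus graphviz for pytorchviz) are installed:

```python
import torch
from torch.utils.tensorboard import SummaryWriter

# TensorBoard: trace the model once and log its graph, then run `tensorboard --logdir runs`
writer = SummaryWriter('runs/orfunction')
writer.add_graph(function_network, batch1)
writer.close()

# pytorchviz: render the autograd graph of an output tensor to a file
from torchviz import make_dot
output = function_network(batch1)
make_dot(output, params=dict(function_network.named_parameters())).render('graph')
```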