Getting loss_bt = 0 from the very start of training

I am training a model from a GitHub repository.
I made one change to the training file: I added weight_decay = 0.5, since no value was mentioned anywhere.
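
For reference, this is the kind of change I mean, a minimal sketch assuming the value is passed to the Adam constructor (weight_decay is a standard argument of optim.Adam and adds L2 regularization):

optimizer = optim.Adam(net.parameters(), lr=opt.lr_init, weight_decay=0.5)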

I ran it for 50 epochs, and the issue is that loss_bt is printed as 0.0000 from the very start of training.
Please help me understand the reason for this.

Here is the training code:

print('train network begin..')
# ------------------------------------ step 3/5  ------------------------------------
 
criterion = nn.L1Loss() # loss function
# criterion = nn.BCELoss() # loss function to compare
#criterion_bet = nn.MSELoss(reduction='none')  # note: reduction expects a string such as 'none', not False
criterion_bet = nn.MSELoss()
print("criterion_bet= ",criterion_bet)
ssim_loss = pytorch_ssim.SSIM(window_size = 11)  # SSIM similarity; maximized by minimizing 1 - SSIM below
print("ssim_loss= ",ssim_loss)
#optimizer = optim.SGD(net.parameters(), lr=opt.lr_init, momentum=0.9, dampening=0.1)
optimizer = optim.Adam(net.parameters(), lr=opt.lr_init)
print("optimizer= ",optimizer)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=300, gamma=0.5)
#scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1000, gamma=0.5)
print("scheduler= ",scheduler)
# ------------------------------------ step 4/5  --------------------------------------------------

start_time = time.time()
for epoch in range(opt.max_epoch):
    loss_sigma = loss_gradients_sum = loss_between_pair_sum = loss_pair_lable_sum = loss_ssim_sum = 0.0  # loss
    scheduler.step()  # currently called at the start of each epoch; see my question at the end
    for i, data in enumerate(train_loader):
        # images and labels
        input1, input2, gt1, gt2, lb1, lb2, gtimg = data
        input1, input2 = Variable(input1).cuda(), Variable(input2).cuda()
        gt1, gt2 = Variable(gt1).cuda(), Variable(gt2).cuda()
        gtimg = Variable(gtimg).cuda().requires_grad_(False)
        inputs = torch.cat((input1, input2), 1) #concat input pair image tensor
        #print("inputs shape=",inputs.shape)
        labels = torch.cat((gt1, gt2), 1).requires_grad_(False) #concat ground truth image tensor 
        #print("labels shape=",labels.shape)
        # forward, backward, update weights
        optimizer.zero_grad()
        outputs = net(inputs)
        #print("outputs shape= ",outputs.shape)
        #print(outputs)
        loss_pair_lable = criterion(outputs,labels) # loss for each image prediction and gt
        g1 = outputs[:,0,:,:].unsqueeze(1) * (input1.data ) # predict feature_map1
        #print("g1 shape=",g1.shape)
        g2 = outputs[:, 1, :, :].unsqueeze(1) * (input2.data) #predict feature_map2
        #print("g2 shape=",g2.shape)
        #g3 = outputs[:, 2, :, :].unsqueeze(1) * (input3.data) #predict feature_map3 
        mergeimg = g1 + g2
        #print("mergeimg shape=",mergeimg.shape)
        #cv2.imshow("mergeimg",mergeimg)
        loss_ssim = 1.0 - ssim_loss(mergeimg.cuda(),gtimg.cuda())
        loss_gradients = GL.gradient_loss_merge(mergeimg,gtimg,opt.cuda,device=0)
        #loss of 1-A-B
        sumpreds = outputs[:,0,:,:] + outputs[:,1,:,:]
        #print("sumpreds=",sumpreds)
        sumpreds = 1.0-sumpreds.unsqueeze(1)
        #print("sumpreds",sumpreds)
        zeroimg = torch.zeros(sumpreds.size())
        loss_between_pair = criterion_bet(sumpreds.cuda(),zeroimg.cuda())
        #print("loss_between_pair=",loss_between_pair)
        loss = 0.8*loss_pair_lable + 0.1*loss_ssim + 0.1*loss_gradients

        loss.backward()

        torch.nn.utils.clip_grad_norm_(net.parameters(), 0.5)  # in-place variant; clip_grad_norm is deprecated
        optimizer.step()

        loss_sigma += loss.item()
        loss_gradients_sum += loss_gradients.item()
        loss_between_pair_sum += loss_between_pair.item()
        loss_pair_lable_sum += loss_pair_lable.item()
        loss_ssim_sum += loss_ssim.item()
        # print information for each 10 iteration
        if i % 10 == 9:
            loss_avg = loss_sigma / 10
            loss_pair_lable_avg = loss_pair_lable_sum / 10
            loss_between_pair_avg = loss_between_pair_sum / 10
            loss_gradients_avg = loss_gradients_sum / 10
            loss_ssim_avg = loss_ssim_sum / 10
            loss_sigma = loss_pair_lable_sum = loss_between_pair_sum = loss_gradients_sum = loss_ssim_sum = 0.0
            print(
                "Training: Epoch[{:0>3}/{:0>3}] Iteration[{:0>3}/{:0>3}] lr:{} Loss: {:.4f} Loss_pair: {:.4f} Loss_bt: {:.4f} Loss_grads: {:.4f} Loss_ssim: {:.4f} ".format(
                    epoch + 1, opt.max_epoch, i + 1, len(train_loader),  scheduler.get_lr()[0], loss_avg, loss_pair_lable_avg,
                    loss_between_pair_avg, loss_gradients_avg, loss_ssim_avg))
            # record loss
            writer.add_scalars('Loss_group', {'train_loss': loss_avg}, epoch)
            # record learning rate
            writer.add_scalar('learning rate', scheduler.get_lr()[0], epoch)
            # record loss_pair_lable_avg
            writer.add_scalar('Loss pair label',loss_pair_lable_avg, epoch)
            # record loss_between_pair
            writer.add_scalar('Loss between pair',loss_between_pair_avg, epoch)
            # record loss_gradients_avg
            writer.add_scalar('Loss gradients',loss_gradients_avg, epoch)
            # record loss_ssim
            writer.add_scalar('Loss ssim',loss_ssim_avg, epoch)
        if epoch % 10 == 0:  # save masks every 10 epochs (note: this runs once per batch in such epochs)
            train_mask_result_dir = os.path.join(opt.outtrain, str(time_str), str(epoch))
            if not os.path.exists(train_mask_result_dir):
                os.makedirs(train_mask_result_dir)
            for j in range(outputs.size(0)):  # use j so the dataloader index i is not clobbered
                outmask1 = outputs[j, 0, :, :].squeeze().unsqueeze(-1).repeat(1, 1, 3)
                mask_id1 = lb1[j]
                outmask2 = outputs[j, 1, :, :].squeeze().unsqueeze(-1).repeat(1, 1, 3)
                mask_id2 = lb2[j]
#                outmask3 = outputs[j, 2, :, :].squeeze().unsqueeze(-1).repeat(1, 1, 3)
#                mask_id3 = lb3[j]
                cv2.imwrite(train_mask_result_dir + '/' + str(mask_id1) + '.jpg', outmask1.cpu().data.numpy() * 255)
                cv2.imwrite(train_mask_result_dir + '/' + str(mask_id2) + '.jpg', outmask2.cpu().data.numpy() * 255)
 #               cv2.imwrite(train_mask_result_dir + '/' + str(mask_id3) + '.jpg', outmask3.cpu().data.numpy() * 255)
                cv2.imwrite(train_mask_result_dir + '/' + str('m_' + mask_id1) + '.jpg', np.clip(outmask1.cpu().data.numpy() * 255 + outmask2.cpu().data.numpy() * 255, 0, 255))

    # for each epoch, record parameter and gradient histograms
    for name, layer in net.named_parameters():
        if layer.grad is not None:  # guard: parameters without a gradient would crash add_histogram
            writer.add_histogram(name + '_grad', layer.grad.cpu().data.numpy(), epoch)
        writer.add_histogram(name + '_data', layer.cpu().data.numpy(), epoch)

#        print("done train")

I have printed sumpreds, zeroimg, and loss_between_pair:

sumpreds= tensor([[[1.0014, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
      [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
      [1.0001, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
      ...,
      [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
      [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
      [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],

    [[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
     [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
     [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
     ...,
     [1.0001, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
     [1.0005, 0.9995, 1.0000,  ..., 1.0000, 1.0001, 0.9994],
     [1.0005, 0.9997, 1.0000,  ..., 0.9999, 0.9999, 1.0034]],

    [[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
     [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
     [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
     ...,
     [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
     [0.9994, 0.9984, 0.9984,  ..., 1.0000, 1.0000, 0.9999],
     [0.9972, 0.9974, 0.9975,  ..., 1.0002, 1.0000, 1.0011]],

    [[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0020, 1.0020],
     [1.0000, 1.0000, 1.0000,  ..., 1.0009, 1.0061, 0.9821],
     [1.0000, 1.0000, 1.0000,  ..., 1.0001, 1.0012, 0.9987],
     ...,
     [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
     [1.0000, 1.0000, 1.0000,  ..., 1.0001, 1.0000, 0.9996],
     [0.9996, 1.0000, 1.0000,  ..., 1.0014, 1.0000, 1.0030]]],
   device='cuda:0', grad_fn=<AddBackward0>)

sumpreds tensor([[[[-1.4184e-03, -1.3471e-05, -1.5497e-06,  ...,  0.0000e+00,
         0.0000e+00,  5.9605e-08],
       [-2.1458e-05,  1.0788e-05,  2.0862e-06,  ...,  0.0000e+00,
         0.0000e+00,  1.7881e-07],
       [-5.6624e-05,  5.1856e-06,  4.1723e-07,  ...,  0.0000e+00,
         0.0000e+00,  5.9605e-08],
       ...,
       [-1.7166e-05, -1.1921e-07,  0.0000e+00,  ...,  0.0000e+00,
         0.0000e+00,  5.9605e-08],
       [ 1.2100e-05,  2.2352e-05,  5.9605e-08,  ...,  0.0000e+00,
         0.0000e+00,  0.0000e+00],
       [-2.4319e-05,  5.4240e-06,  5.9605e-07,  ...,  5.9605e-08,
         2.9802e-07,  0.0000e+00]]],

    [[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  5.9605e-08,
        1.1921e-07, -2.8610e-06],
      [-1.1921e-07,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
       -1.1921e-07,  1.6809e-05],
      [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
        0.0000e+00,  5.9605e-08],
      ...,
      [-1.2350e-04, -3.2187e-06,  1.1921e-07,  ...,  5.9605e-08,
       -3.5763e-06, -2.7418e-06],
      [-5.0545e-04,  5.4896e-04,  5.3644e-07,  ...,  6.0201e-06,
       -8.1539e-05,  6.0880e-04],
      [-5.2428e-04,  3.0160e-04,  3.6359e-06,  ...,  1.3304e-04,
        5.3585e-05, -3.3894e-03]]],


    [[[-4.7684e-07,  5.9605e-08,  2.9802e-07,  ...,  0.0000e+00,
        5.9605e-08,  7.1526e-07],
      [-3.5763e-06,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
        5.9605e-08,  6.4373e-06],
      [-1.1921e-07,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
        0.0000e+00,  0.0000e+00],
      ...,
      [-7.1526e-07, -3.5763e-07,  1.7881e-07,  ...,  6.5565e-07,
       -1.0729e-06,  6.5565e-07],
      [ 5.8669e-04,  1.6177e-03,  1.5993e-03,  ..., -1.1563e-05,
        2.7418e-06,  9.5725e-05],
      [ 2.7945e-03,  2.5978e-03,  2.5083e-03,  ..., -2.2948e-04,
        7.3910e-06, -1.0865e-03]]],


    [[[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -2.6226e-05,
       -1.9660e-03, -2.0355e-03],
      [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -8.6558e-04,
       -6.1114e-03,  1.7874e-02],
      [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ..., -7.6771e-05,
       -1.2087e-03,  1.2775e-03],
      ...,
      [-4.6492e-05,  0.0000e+00,  0.0000e+00,  ...,  2.3842e-06,
       -6.1989e-06,  1.4305e-05],
      [-3.5763e-07,  5.9605e-08,  0.0000e+00,  ..., -1.3196e-04,
       -1.5616e-05,  3.7968e-04],
      [ 4.0144e-04,  1.5497e-05,  3.3379e-06,  ..., -1.3664e-03,
       -5.9605e-07, -3.0183e-03]]]], device='cuda:0',
   grad_fn=<RsubBackward1>)

zeroimg= tensor([[[[0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.],
      ...,
      [0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.]]],

    [[[0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.],
      ...,
      [0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.]]],


    [[[0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.],
      ...,
      [0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.]]],


    [[[0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.],
      ...,
      [0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.],
      [0., 0., 0.,  ..., 0., 0., 0.]]]])

loss_between_pair= tensor(2.4461e-07, device='cuda:0', grad_fn=<MseLossBackward>)
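
One thing I noticed: loss_between_pair is a tiny positive number (2.4461e-07), not exactly zero, and the training log formats it with {:.4f}, which rounds it to 0.0000. A minimal check of just the formatting, using the value printed above:

print("Loss_bt: {:.4f}".format(2.4461e-07))  # prints: Loss_bt: 0.0000

So is the loss perhaps not actually zero, just displayed as zero?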

And here is the output from the last epoch (I didn't take a screenshot of the very first epoch):

Training: Epoch[050/050] Iteration[4950/5000] lr:1.105429575052089e-78 Loss: 0.0040 Loss_pair: 0.0022 Loss_bt: 0.0000 Loss_grads: 0.0200 Loss_ssim: 0.0023
Training: Epoch[050/050] Iteration[4960/5000] lr:1.105429575052089e-78 Loss: 0.0049 Loss_pair: 0.0032 Loss_bt: 0.0000 Loss_grads: 0.0211 Loss_ssim: 0.0024
Training: Epoch[050/050] Iteration[4970/5000] lr:1.105429575052089e-78 Loss: 0.0059 Loss_pair: 0.0049 Loss_bt: 0.0000 Loss_grads: 0.0178 Loss_ssim: 0.0023
Training: Epoch[050/050] Iteration[4980/5000] lr:1.105429575052089e-78 Loss: 0.0075 Loss_pair: 0.0065 Loss_bt: 0.0000 Loss_grads: 0.0198 Loss_ssim: 0.0025
Training: Epoch[050/050] Iteration[4990/5000] lr:1.105429575052089e-78 Loss: 0.0033 Loss_pair: 0.0018 Loss_bt: 0.0000 Loss_grads: 0.0168 Loss_ssim: 0.0017
Training: Epoch[050/050] Iteration[5000/5000] lr:2.7635739376302223e-79 Loss: 0.0052 Loss_pair: 0.0039 Loss_bt: 0.0000 Loss_grads: 0.0183 Loss_ssim: 0.0024

Please help. Also, is it okay to call scheduler.step() at the beginning of the epoch loop, as in my code, or should it come after optimizer.step()?
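
From the PyTorch docs I understand that since version 1.1.0 scheduler.step() is supposed to be called after optimizer.step(), typically once at the end of each epoch. A minimal sketch of the ordering I believe is recommended (same names as in my code above; the loss computation is elided):

for epoch in range(opt.max_epoch):
    for i, data in enumerate(train_loader):
        optimizer.zero_grad()
        # ... forward pass and loss computation as above ...
        loss.backward()
        optimizer.step()    # update the weights first
    scheduler.step()        # then advance the LR schedule, once per epoch

Is that the correct order?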