Same process, different results

I tried to replace the double-loop structure in the loss function (marked as ‘A’) with a generator-based version (marked as ‘B’). After the first forward pass, A and B produce losses of the same magnitude, e.g. 44.123 (A) and 44.231 (B). After the second loss calculation, A drops to 20.456, while B is still around 44.xxx. After the third loss calculation, A drops to 10.xxx, while B still shows no obvious change. The two networks are identical except for the loss function.
I suspect that backpropagation is not actually happening in B.
At the end of B, the following

    if IS_USE_GPU:
        temp_loss = Variable(temp_loss, requires_grad=True)
    else:
        temp_loss = Variable(temp_loss, requires_grad=True)

was added as a last-resort workaround, but it didn't help.
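For reference, a minimal way to check whether a loss tensor is still connected to the computation graph is to look at its grad_fn and at the parameter gradients after backward(); the model and data below are just placeholders, not the actual network.

import torch
import torch.nn as nn

# Toy stand-ins, only to illustrate the check; not the real network or loss
model = nn.Linear(8, 4)
out = model(torch.randn(2, 8))
loss = out.pow(2).mean()

# A loss that backpropagation can flow through has requires_grad=True and a grad_fn
print(loss.requires_grad, loss.grad_fn)   # True  <MeanBackward0 ...>

loss.backward()
# After backward(), gradients should be populated on the model parameters
print(model.weight.grad is not None)      # True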
Has anyone run into a similar situation? Thanks in advance for your guidance.

A:

import torch
from torch.autograd import Variable
from scipy.special import comb  # assumed source of comb(); math.comb would also work

def MyLoss(pred, label):
    if IS_USE_GPU:
        temp_loss = Variable(torch.tensor(0.).cuda(), requires_grad=True)
        Ld = Variable(torch.tensor(0.).cuda(), requires_grad=True)
        Lh = Variable(torch.tensor(0.).cuda(), requires_grad=True)
    else:
        temp_loss = Variable(torch.tensor(0.), requires_grad=True)
        Ld = Variable(torch.tensor(0.), requires_grad=True)
        Lh = Variable(torch.tensor(0.), requires_grad=True)
        
    tpred1, tpred2 = pred
    for inxi, di in enumerate(tpred2, 0):
        # L1 penalty pushing each |di| towards 1
        tempLd1 = torch.norm(torch.abs(di) - 1., p=1)
        if inxi == len(label) - 1:  # the last sample has no remaining partner
            break
        else:
            for inxj, dj in enumerate(tpred2[inxi+1:], inxi+1):
                tempLd = tempLd1 + torch.norm(torch.abs(dj) - 1., p=1)
                Ld = Ld + tempLd
                # Rij = 1 for a pair with the same label, 0 otherwise
                if label[inxi] == label[inxj]:
                    Rij = 1.0
                else:
                    Rij = 0.0
                # similar pairs: squared distance; dissimilar pairs: hinge with margin 48
                tempLh1 = Rij * torch.sum(torch.pow(di - dj, 2))
                tempLh2 = (1.0 - Rij) * max(48.0 - torch.sum(torch.pow(di - dj, 2)), 0.)
                tempLh = 0.5 * (tempLh1 + tempLh2)
                Lh = Lh + tempLh
    # average over all C(n, 2) pairs
    Lh = Lh / comb(len(label), 2)
    Ld = Ld / comb(len(label), 2)
    temp_loss = Lh + 0.01 * Ld
    # release GPU
    del tempLh1, tempLh2, tempLd, tpred1, tpred2  
    torch.cuda.empty_cache()

    return temp_loss, Lh, Ld

B:

def MyLoss(pred, label):
    Ld = torch.tensor(0.)
    Lh = torch.tensor(0.)
    tpred1, tpred2 = pred
    for indx, di in enumerate(tpred2, 0):
        # L1 penalty pushing |di| towards 1
        tempLd1 = torch.norm(torch.abs(di) - 1., p=1)
        # collect the per-pair values into new tensors
        tempLd = torch.tensor(list(tempLd1 + torch.norm(torch.abs(dj) - 1., p=1) for dj in tpred2[indx+1:]))
        tempLd = torch.sum(tempLd)
        # Rij = 1 for pairs with the same label, 0 otherwise
        Rij = torch.tensor(list(label[indx] == label[indx+1:])).float()
        Rij_rev = 1.0 - Rij
        temp_dis1 = torch.tensor(list(torch.sum(torch.pow(di - dj, 2)) for dj in tpred2[indx+1:])).float()
        temp_dis2 = torch.tensor(list(max(48.0 - torch.sum(torch.pow(di - dj, 2)), 0.) for dj in tpred2[indx+1:])).float()
        # similar pairs: squared distance; dissimilar pairs: hinge with margin 48
        tempLh = 0.5 * (torch.sum(temp_dis1.mul(Rij)) + torch.sum(temp_dis2.mul(Rij_rev)))
        Lh = Lh + tempLh
        Ld = Ld + tempLd
    
    Lh = Lh/comb(len(label),2)
    Ld = Ld/comb(len(label),2)
    temp_loss = Lh + 0.01*Ld
    if IS_USE_GPU:
        temp_loss = Variable(temp_loss, requires_grad=True)
    else:
        temp_loss = Variable(temp_loss, requires_grad=True)
    
    # release
    del tempLd1, temp_dis1, temp_dis2, tempLd, tpred1, tpred2, Rij, Rij_rev
    torch.cuda.empty_cache()
    return temp_loss, Lh, Ld
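
For reference, a minimal standalone sketch (made-up tensors, not the real network) of how copying per-pair values into a fresh tensor compares with torch.stack in terms of gradient tracking, and what re-wrapping in Variable(..., requires_grad=True) does:

import torch
from torch.autograd import Variable

a = torch.randn(3, requires_grad=True)
b = torch.randn(3, requires_grad=True)

# two per-pair style values, each still attached to the graph
vals = [torch.sum((a - b) ** 2), torch.norm(torch.abs(a) - 1., p=1)]

copied = torch.tensor([v.item() for v in vals])  # plain copy of the numbers: no grad_fn
stacked = torch.stack(vals)                      # stays in the graph: has a grad_fn

print(copied.requires_grad, copied.grad_fn)      # False None
print(stacked.requires_grad, stacked.grad_fn)    # True  <StackBackward0 ...>

# re-wrapping the copy only creates a new leaf; it is still not connected to a or b
rewrapped = Variable(copied, requires_grad=True)
print(rewrapped.requires_grad, rewrapped.grad_fn)  # True  None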