Can I use loss.item() for backward?

My loss values are:

loss_iden_12t = IdentLoss(l1[0], l1[1], l1[2])
loss_iden_rcs1s1t = IdentLoss(l2[0], l2[1], l2[2])
loss_iden_rcs2s2t = IdentLoss(l3[0], l3[1], l3[2])

loss_identy = loss_iden_12t.item() + loss_iden_rcs1s1t.item() + loss_iden_rcs2s2t.item()

p_pred_recon = p_d(recon)
g_pred_recon = g_d(recon)

loss_G_g = CELoss(p_pred_recon, torch.ones_like(p_pred_recon))
loss_G_p = CELoss(g_pred_recon, torch.ones_like(g_pred_recon))

loss_G = 20 * loss_G_g + 30 * loss_G_p + 100 * loss_identy

IdentLoss is my custom loss function.

I know that to call loss.backward() the object should be a Tensor,

but in a composite loss, can one of the loss values be a float?

I tried it with the float value loss_identy and there was no error, but I'm concerned about backpropagation for IdentLoss.

Even though I used a float loss value, did it still backpropagate?

Hi,

If you use .item(), no gradient will flow back on that side.
This “works” because you later add that Python number to a Tensor that requires grad, which gives you a final loss that is a Tensor that requires grad.

But no, the gradient won't flow back to the Python numbers!
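As a quick illustration (a minimal sketch, not your model), you can check that a term reduced to a Python float with .item() contributes nothing to the gradient:

import torch

w = torch.randn(3, requires_grad=True)
loss_a = (w ** 2).sum()          # stays in the autograd graph
loss_b = w.exp().sum()           # will be cut out of the graph below

total = loss_a + loss_b.item()   # .item() turns loss_b into a plain Python float
total.backward()

print(w.grad)                    # equals 2 * w: only loss_a contributed, loss_b did not

If you only want the identity losses for logging, keep them as Tensors in the sum you call backward() on and use .item() afterwards, e.g. loss_identy = loss_iden_12t + loss_iden_rcs1s1t + loss_iden_rcs2s2t, then log loss_identy.item().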

Thanks for replying!

I have one more question.

When I run it without .item(), the GPU memory usage keeps increasing continuously, so one epoch can't even finish.

import torch
import torch.nn as nn


class TripletLoss(nn.Module):
    def __init__(self, a1, a2, a3):
        super(TripletLoss, self).__init__()
        self.a1 = a1
        self.a2 = a2
        self.a3 = a3
        
    def forward(self, anchor, positive, negative):
        distance_positive = (anchor - positive).pow(2).sum(1)
        distance_negative = (anchor - negative).pow(2).sum(1)
        loss1 = distance_positive - distance_negative + self.a1
        loss2 = self.a3 * (distance_positive)
        losses = loss1 + loss2
        
        return losses.mean()


class TripletNet(nn.Module):
    def __init__(self, device):
        super(TripletNet, self).__init__()
        
        self.embedding_net = Encoder_id(in_channel = 3, n_ker = 32, device=device).to(device)
        
    def forward(self, s1, s2, t):
        s1 = self.embedding_net(s1)
        s2 = self.embedding_net(s2)
        t = self.embedding_net(t)
        
        return s1, s2, t


class Encoder_id(nn.Module):
    def __init__(self, in_channel, n_ker, device):
        super(Encoder_id, self).__init__()
        
        self.encoder = nn.Sequential(
            nn.Conv2d(in_channel, n_ker, 4, 2),
            nn.LeakyReLU(),
            nn.Conv2d(n_ker, n_ker * 2, 4, 2),
            nn.LeakyReLU(),
            nn.Conv2d(n_ker * 2, n_ker * 4, 4, 2),
            nn.LeakyReLU(),
            nn.Conv2d(n_ker * 4, n_ker * 8, 4, 2),
            nn.LeakyReLU(),
            nn.Conv2d(n_ker * 8, n_ker * 16, 4, 2),
            nn.LeakyReLU()
        )
        
        self.device = device
        
    def forward(self, x):
        batch_size = x.size(0)
        
        enc = self.encoder(x)

        enc = enc.view(batch_size, -1)

        fc = nn.Linear(enc.size(1), 128).to(self.device)
        enc = fc(enc)
        enc = torch.sigmoid(enc)
        
        return enc

This is my IdentLoss function, along with the encoder and embedding network that produce the IdentLoss inputs.

Is there anything wrong with my code?

You should post your training loop. The growing memory usage likely occurs because you do not clear the gradients anywhere (zero_grad).
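For reference, the usual per-batch pattern looks like this (a generic sketch with hypothetical loader / model / criterion / optimizer names, not your code):

for batch, target in loader:
    optimizer.zero_grad()              # clear gradients accumulated in the previous iteration
    output = model(batch)              # forward pass builds the autograd graph
    loss = criterion(output, target)
    loss.backward()                    # frees the graph and writes gradients into .grad
    optimizer.step()                   # update the parameters

The important part is clearing the gradients once per iteration, before the next backward() call.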

origin = x[0].to(device)
lm = x[3].to(device)
m = x[1].to(device)
f = x[2].to(device)

rec_m, (rec_f, mean_f, logvar_f, latent_f), (rec_l, mean_l, logvar_l, latent_l) = model(origin)

# Calc loss
concat_s1t = torch.cat((latent_l[:10], latent_f[20:]), dim = 1)
concat_s2t = torch.cat((latent_l[10:20], latent_f[20:]), dim = 1)
image_without_face_st = origin[20:] - (1 - rec_m[20:])

size = torch.cat((latent_l, latent_f), dim = 1).size()
rand_latent = torch.rand(size).to(device)
img_without_face = origin * (1 - rec_m)

recon_s1t = netG(image_without_face_st, torch.cat((latent_l[:10], latent_f[20:]), dim = 1))
recon_s2t = netG(image_without_face_st, torch.cat((latent_l[10:20], latent_f[20:]), dim = 1))

recon = netG(img_without_face, torch.cat((latent_l, latent_f), dim = 1))
rand_recon = netG(img_without_face, rand_latent)

set_requires_grad(g_d, True)
optim_g_D.zero_grad()

g_pred_real = g_d(origin)
g_pred_recon = g_d(recon.detach())
g_pred_rand_recon = g_d(rand_recon.detach())

# print(g_pred_real.size(), g_pred_recon.size(), g_pred_rand_recon.size())

loss_G_d_real = CELoss(g_pred_real, torch.ones_like(g_pred_real))
loss_G_d_recon = CELoss(g_pred_recon, torch.zeros_like(g_pred_recon))
loss_G_d_rand = CELoss(g_pred_rand_recon, torch.zeros_like(g_pred_rand_recon))
loss_G_d = (loss_G_d_rand + loss_G_d_recon + loss_G_d_real) * (1/3)

loss_G_d.backward()
optim_g_D.step()

set_requires_grad(p_d, True)
optim_p_D.zero_grad()

p_pred_real = p_d(origin)
p_pred_recon = p_d(recon.detach())
p_pred_rand_recon = p_d(rand_recon.detach())

loss_P_d_real = CELoss(p_pred_real, torch.ones_like(p_pred_real))
loss_P_d_recon = CELoss(p_pred_recon, torch.zeros_like(p_pred_recon))
loss_P_d_rand = CELoss(p_pred_rand_recon, torch.zeros_like(p_pred_rand_recon))

loss_P_d = loss_P_d_real + loss_P_d_recon + loss_P_d_rand
loss_P_d = loss_P_d * (1/3)

loss_P_d.backward()
optim_p_D.step()

set_requires_grad([p_d, g_d], False)
set_requires_grad([netG, triplet], True)
optim_G.zero_grad()

l1 = triplet(origin[:10], origin[10:20], origin[20:])
l2 = triplet(recon_s1t, origin[:10], origin[20:])
l3 = triplet(recon_s2t, origin[10:20], origin[20:])

loss_iden_12t = IdentLoss(l1[0], l1[1], l1[2])
loss_iden_rcs1s1t = IdentLoss(l2[0], l2[1], l2[2])
loss_iden_rcs2s2t = IdentLoss(l3[0], l3[1], l3[2])

loss_identy = loss_iden_12t.item() + loss_iden_rcs1s1t.item() + loss_iden_rcs2s2t.item()

p_pred_recon = p_d(recon)
g_pred_recon = g_d(recon)

loss_G_g = CELoss(p_pred_recon, torch.ones_like(p_pred_recon))
loss_G_p = CELoss(g_pred_recon, torch.ones_like(g_pred_recon))

loss_G = 20 * loss_G_g + 30 * loss_G_p + 100 * loss_identy

loss_G.backward()
optim_G.step()

This is my training code.
Do you mean optim_g_D.zero_grad(), optim_p_D.zero_grad(), and optim_G.zero_grad()?

No offence, but that is quite messy code to dig through :smile: Yes, you can set all the optimizers to zero grad, but I suspect it will be easier if you just call zero_grad on the model - that way you only have to do it once.
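Something like this, for example (a sketch using the module names from your code; nn.Module.zero_grad() resets the gradients of every parameter registered inside that module):

# at the start of each training iteration, before any backward()
for net in (model, netG, triplet, g_d, p_d):
    net.zero_grad()

If all of these sub-networks were registered inside one top-level nn.Module, a single zero_grad() call on that module would cover them all.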

Sorry for the dirty code :sweat_smile:

I set zero_grad on the model, and it has an effect on the initial GPU memory usage,

but it still keeps increasing.

Thanks for the reply!