RuntimeError: Function 'PowBackward0' returned nan values in its 0th output

RuntimeError                              Traceback (most recent call last)
<ipython-input-3-0c9f361b4bf0> in <module>
    260 if __name__ == "__main__":
    261     for epoch in range(1, epochs + 1):
--> 262         train(epoch)

<ipython-input-3-0c9f361b4bf0> in train(epoch)
    239 
    240 
--> 241         loss.backward(create_graph=False,retain_graph=True)
    242         optimizer.step()
    243 

~\Anaconda3\lib\site-packages\torch\tensor.py in backward(self, gradient, retain_graph, create_graph)
    193                 products. Defaults to ``False``.
    194         """
--> 195         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    196 
    197     def register_hook(self, hook):

~\Anaconda3\lib\site-packages\torch\autograd\__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
     97     Variable._execution_engine.run_backward(
     98         tensors, grad_tensors, retain_graph, create_graph,
---> 99         allow_unreachable=True)  # allow_unreachable flag
    100 
    101 

RuntimeError: Function 'PowBackward0' returned nan values in its 0th output.

I have tried it on a sample of 10 data points with parameter sharing, and it works on that. But when I increase the data size with parameter sharing, this error occurs.

#model class
class Framework(nn.Module):
    def __init__(self,input_shape, representation_size, output_shape, fc1, fc21,fc22,fc3,fc4):
        super(Framework, self).__init__()
        self.fc1 = fc1
        self.fc21 = fc21
        self.fc22 = fc22
        self.fc3 = fc3
        self.fc4 = fc4
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
        self.elu=nn.ELU()
    
    def encoder(self, x):
        """Encode a batch of samples, and return posterior parameters for each point."""
        h1 = self.relu(self.fc1(x))
        return self.fc21(h1), self.elu(self.fc22(h1))+1
    
    def decoder(self, z):
        h3 = self.relu(self.fc3(z))
        return self.sigmoid(self.fc4(h3))
        
    def reparam(self, mu, logvar):
        """Reparameterisation trick to sample z values. 
        This is stochastic during training,  and returns the mode during evaluation."""
        
        if self.training:
            std = logvar.mul(0.5).exp_()
            eps = Variable(std.data.new(std.size()).normal_())
            return eps.mul(std).add_(mu)
        else:
            return mu
        
    def get_z(self, x):
        """Encode a batch of data points, x, into their z representations."""
        
        mu, logvar = self.encoder(x.view(-1, input_shape))
        return self.reparam(mu, logvar)
    
    def forward(self, x):
        """Takes a batch of samples, encodes them, and then decodes them again to compare."""
        mu, logvar = self.encoder(x.view(-1, input_shape))
        z = self.reparam(mu, logvar)
        return self.decoder(z), mu, logvar,z

#parameter sharing
fc1 = nn.Linear(input_shape, 512,bias=True)
torch.nn.init.xavier_uniform_(fc1.weight)
fc21 = nn.Linear(512, representation_size,bias=True)
torch.nn.init.xavier_uniform_(fc21.weight)
fc22 = nn.Linear(512, representation_size,bias=True)
torch.nn.init.xavier_uniform_(fc22.weight)
fc3 = nn.Linear(representation_size, 512,bias=True)
torch.nn.init.xavier_uniform_(fc3.weight)
fc4 = nn.Linear(512, output_shape,bias=True)
torch.nn.init.xavier_uniform_(fc4.weight)

#Complete model
model1 = Framework(input_shape, representation_size, output_shape, fc1, fc21,fc22,fc3,fc4).to(device)
model2 = Framework(input_shape, representation_size, output_shape, fc1, fc21,fc22,fc3,fc4).to(device)
model3 = Framework(input_shape, representation_size, output_shape, fc1, fc21,fc22,fc3,fc4).to(device)

#define optimizers

f_params=model1.parameters()
s_params=model2.parameters()
t_params=model3.parameters()

dvne_params=itertools.chain(f_params,s_params,t_params)

optimizer = optim.RMSprop(dvne_params, lr=learning_rate)

#train Model
epoch_loss=[]
Auc_score=[]
def train(epoch):
    
    #for param_group in optimizer.param_groups:
        ##print(param_group['lr'], "learning rate for Auto-Encoder.")
    model1.train()
    model2.train()
    model3.train()
    
    torch.autograd.set_detect_anomaly(True)
    train_loss,Auc=0,0
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    for batch_idx, data in enumerate(train_loader):
        print(batch_idx)
        #torch.autograd.set_detect_anomaly(False)
        data1 = data[0].to(device)
        data2 = data[1].to(device)
        data3 = data[2].to(device)

        data1 = Variable(data1)
        data2 = Variable(data2)
        data3 = Variable(data3)
        
        recon_batch1, mu1, logvar1,z1 = model1(data1.float())
        recon_batch2, mu2, logvar2,z2 = model2(data2.float())
        recon_batch3, mu3, logvar3,z3 = model3(data3.float())
        
        loss = wasserstein_loss(data1,recon_batch1,z1, mu1, logvar1, data2,recon_batch1,z2, mu2, logvar2, data3,recon_batch1,z3, mu3, logvar3)

        auroc=wasserstein_acc(data1,recon_batch1,data2,recon_batch2,data3,recon_batch3)
        auroc=100*auroc

        optimizer.zero_grad()

        loss.backward(retain_graph=True)
        optimizer.step()
        #print(recon_batch1, data1)
        train_loss+=loss.item()
        Auc+=auroc.item()
        if (batch_idx) % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6}\tauc: {:.6}'.format(
                epoch, batch_idx*batch_size* len(data)/3, len(train_loader.dataset),
                100. * batch_idx / len(train_loader),
                loss.item()/ len(data),auroc))
            
    epochloss=(train_loss / (len(train_loader.dataset)))
    AUC=(Auc*batch_size/(len(train_loader.dataset)))
    #print('====> Epoch: {} Average loss: {:.6}\t Auc: {:.6f}'.format(epoch,epochloss,AUC))
    epoch_loss.append(epochloss)
    Auc_score.append(AUC)
    print('max_auc: {:.6}\tmin_loss: {:.6}\tmax_auc_epoch: {}\tmin_loss_epoch: {}'.format(max(Auc_score),min(epoch_loss),
                                                                Auc_score.index(max(Auc_score))+1,epoch_loss.index(min(epoch_loss))+1))
if __name__ == "__main__":
    for epoch in range(1, epochs + 1):
        train(epoch)
    epoch=np.arange(epochs)
    fig, ax = plt.subplots( nrows=1, ncols=1 )
    plt.plot(epoch,Auc_score, marker='o',color='r', label='Square')
    plt.plot(epoch,epoch_loss , marker='o',color='b', label='Square') 
    plt.xticks(epoch_loss, Auc_score)
    fig.savefig('curve.pdf')
    plt.close(fig)
    max(Auc_score),min(epoch_loss)

I am not computing the pow function in any incorrect way.

#define loss function
def root(var):
    res=(var**0.5)
    return res

def reshape(tens):
    return torch.reshape(tens,(-1,))

def W2_dis(mean1, logvar1, mean2, logvar2):
    mean1=torch.reshape(mean1,(-1,))
    mean2=torch.reshape(mean1,(-1,))
    logvar1=torch.reshape(logvar1,(-1,))
    logvar2=torch.reshape(logvar2,(-1,))
    agg_var=torch.sum(logvar1.exp().pow(0.25)-logvar2.exp().pow(0.25))**2
    agg_mean=(torch.norm(mean1-mean2, 2))**2
    distance =(agg_mean+agg_var).pow(0.5)
    return distance

def wl1(m1, v1,m2, v2, m3, v3):
    ik=(-1*W2_dis(m1,v1,m3,v3)).exp()
    ij=W2_dis(m1,v1,m2,v2)
    r_loss=(ij**2+ik)
    return r_loss

def l2(x1,recon_x1,z1):
    x1=torch.reshape(x1,(-1,)).type(torch.cuda.FloatTensor)
    x1=x1/torch.sum(x1)
    #print(x1)
    recon_x1=torch.reshape(recon_x1,(-1,))
    bracket=torch.norm((x1*(x1 - recon_x1)),2)**2
    E_px=torch.mean(x1)
    E_qz=torch.mean(z1)
    loss=E_px*bracket*E_qz
    return loss

def wl2(x1,recon_x1,z1,x2,recon_x2,z2,x3,recon_x3,z3):
    loss1=l2(x1,recon_x1,z1)
    loss2=l2(x2,recon_x2,z2)
    loss3=l2(x3,recon_x3,z3)
    loss_final=torch.min(torch.min(loss1,loss2),torch.min(loss2,loss3))
    return loss_final

def wasserstein_loss(x1,recon_x1,z1, mu1, var1,x2,recon_x2,z2, mu2, var2,x3,recon_x3,z3, mu3, var3):
    l1_loss=wl1(mu1, var1, mu2, var2, mu3, var3)
    l2_loss=wl2(x1,recon_x1,z1,x2,recon_x2,z2,x3,recon_x3,z3)
    w_final=l1_loss+(0.4)*l2_loss
    return w_final

def rocauc_score(y_true,y_pred):
    y_true=torch.reshape(y_true,(-1,))
    y_pred=torch.reshape(y_pred,(-1,))
    y_true=y_true.detach().cpu().numpy()
    y_pred=y_pred.detach().cpu().numpy()
    #print(y_pred,y_true)
    return roc_auc_score(y_true,y_pred)

def wasserstein_acc(data1,recon_batch1,data2,recon_batch2, data3,recon_batch3):
    score1=rocauc_score(data1,recon_batch1)
    score2=rocauc_score(data2,recon_batch2)
    score3=rocauc_score(data3,recon_batch3)
    auroc=(score1+score2+score3)/3
    return auroc

How is root used and did you make sure to pass only positive values to it?

Yes, I made sure, it's always positive.

Just for the sake of debugging, I would split the pow operations that apply a root to the input, and add an assert statement on the input to check for negative values.
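For example, a debug variant of W2_dis along those lines could look like the sketch below (W2_dis_debug and the assert messages are just illustrative, not part of the original code):

import torch

def W2_dis_debug(mean1, logvar1, mean2, logvar2):
    # mirrors the W2_dis computation above, but splits out each root
    # and asserts that its input is non-negative before applying pow
    mean1 = torch.reshape(mean1, (-1,))
    mean2 = torch.reshape(mean2, (-1,))
    logvar1 = torch.reshape(logvar1, (-1,))
    logvar2 = torch.reshape(logvar2, (-1,))

    var1, var2 = logvar1.exp(), logvar2.exp()
    assert (var1 >= 0).all() and (var2 >= 0).all(), "negative input to .pow(0.25)"

    agg_var = torch.sum(var1.pow(0.25) - var2.pow(0.25)) ** 2
    agg_mean = torch.norm(mean1 - mean2, 2) ** 2

    radicand = agg_mean + agg_var
    assert (radicand >= 0).all(), "negative input to .pow(0.5): {}".format(radicand.item())
    return radicand.pow(0.5)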

Yes, I found that

(agg_mean+agg_var).pow(0.5)

was negative. Thanks again!

Hey, as the title already states, I'm getting a ‘PowBackward0’ error. I've looked through the other discussions here, but can't find the issue.

def contrastiveloss(self, x1, x2, y):
    diff = abs(x1 - x2)
    dist = torch.sum(torch.pow(diff, 2), 1)
    euclidian_dist = torch.sqrt(dist)
    pos = y * euclidian_dist

    #negative pair
    margin = self.hparams['margin_loss']**2
    max = torch.clamp(margin-euclidian_dist, min=0)
    neg = (1-y)*max

    loss = pos + neg
    loss = torch.sum(loss) / 2.0 / x1.size()[0]  # since data is in batches, we need to average them
    return loss

I don't see where anything in this code can go wrong; I already made the diff variable absolute.

To check which variables are becoming negative, either print them or use assert.
I don't know for sure, but I suspect the dist variable is becoming negative. Use dist = abs(torch.sum(torch.pow(diff, 2), 1))
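Applied to the loss above, that kind of check could look like this quick sketch (the tensor shapes are placeholders, just for illustration):

import torch

x1 = torch.randn(8, 128, requires_grad=True)  # placeholder embeddings
x2 = torch.randn(8, 128, requires_grad=True)

diff = torch.abs(x1 - x2)
dist = torch.sum(torch.pow(diff, 2), 1)
assert (dist >= 0).all(), "dist went negative: {}".format(dist.min().item())

euclidian_dist = torch.sqrt(dist)
assert not torch.isnan(euclidian_dist).any(), "sqrt produced NaNs"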

Maybe you can try euclidian_dist = torch.sqrt(dist+1e-6), as I solved a similar issue this way. I guess it is because dist gets so close to zero that the gradient of the square root blows up and turns to NaN.
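For instance, a standalone version of the contrastive loss with the epsilon under the square root might look like this (contrastive_loss_stable is just an illustrative name, and margin here stands in for self.hparams['margin_loss']**2 from the original):

import torch

def contrastive_loss_stable(x1, x2, y, margin, eps=1e-6):
    diff = torch.abs(x1 - x2)
    dist = torch.sum(torch.pow(diff, 2), 1)
    # eps keeps the gradient of sqrt finite when dist is exactly zero,
    # since d/dx sqrt(x) = 1 / (2 * sqrt(x)) blows up at x = 0
    euclidian_dist = torch.sqrt(dist + eps)

    pos = y * euclidian_dist                                      # positive pairs
    neg = (1 - y) * torch.clamp(margin - euclidian_dist, min=0)   # negative pairs

    # average over the batch, as in the original
    return torch.sum(pos + neg) / 2.0 / x1.size(0)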