Custom penalty not having any gradient flow

Hi,

I am trying to add a penalty term to my loss function, but the penalty does not seem to take part in the backward pass, even though every element of the penalty has been declared as a Variable.

The code that builds my penalty is as follows:

def make_fisher_matrix(self, previous_dataset, previous_batch_size, prev_nums):
    print("making_fisher")
    prev_idxs = get_labels_indices(previous_dataset.train_labels, prev_nums)
    loader = DataLoader(previous_dataset, batch_size=previous_batch_size, sampler=torch.utils.data.sampler.SubsetRandomSampler(prev_idxs))
    self.eval()
    # initialise the Fisher diagonal as zeroed copies of the parameters
    self.fisher_matrix = {n: p.clone().zero_() for n, p in self.named_parameters()}
    for k, (i_data,label) in enumerate(loader):
        data = Variable(i_data)
        label = Variable(label)
        previous_prediction = self(data)
        log_pp = F.log_softmax(previous_prediction, dim=1)  # log-softmax over the class dimension
        likelihood = F.nll_loss(log_pp, label)
        likelihood.backward(retain_graph=True)  # note: grads accumulate in p.grad across batches

    # squared accumulated gradients, normalised by the dataset size
    for n, p in self.named_parameters():
        self.fisher_matrix[n] = (p.grad.clone() ** 2) / len(previous_dataset)
    self.prev_parameters = {n: p.clone() for n, p in self.named_parameters()}

def get_ewc_loss(self, lamda, debug=False):
    try:
        losses = Variable(torch.zeros(1))
        for n, p in self.named_parameters():
            p.requires_grad = True
            pp_fisher = Variable(self.fisher_matrix[n])
            pp = Variable(self.prev_parameters[n])
            loss = (pp_fisher * ((p - pp) ** 2)).sum()
            losses += loss

        return Variable((lamda / 2) * losses)
    except:
        # no fisher_matrix/prev_parameters yet (first task): zero penalty
        return Variable(torch.zeros(1))
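To make the intent concrete, here is a minimal standalone sketch of the penalty I am aiming for, i.e. (lamda / 2) * sum_n F_n * (p_n - p*_n)^2, in PyTorch >= 0.4 style without Variable wrappers. The model, Fisher values and snapshot below are placeholders, not my real setup:

import torch
import torch.nn as nn

model = nn.Linear(4, 2)  # placeholder model
# stand-in Fisher diagonal and parameter snapshot
fisher = {n: torch.rand_like(p) for n, p in model.named_parameters()}
prev_params = {n: p.detach().clone() for n, p in model.named_parameters()}

lamda = 0.1
penalty = sum((fisher[n] * (p - prev_params[n]) ** 2).sum()
              for n, p in model.named_parameters())
ewc_loss = (lamda / 2) * penalty

print(ewc_loss.requires_grad)  # True: the graph reaches back to the parameters
ewc_loss.backward()            # runs, and fills in the parameters' .grad

Written this way the penalty has a grad_fn and backpropagates, which is the behaviour I expected from the class version above.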

I use my `get_ewc_loss` function in the following manner:

for batch, (data, label) in enumerate(self.train_loader):
    if self.use_gpu:
        input_data, g_label = Variable(data.cuda()), Variable(label.cuda())
    else:
        input_data, g_label = Variable(data), Variable(label)
    self.opt.zero_grad()
    output_vector = self.model(input_data)
    batch_error = self.criterion(output_vector, g_label)
    ewc_error = self.model.get_ewc_loss(lamda=self.lamda, debug=False)
    #final_error = batch_error + ewc_error
    #final_error.backward()
    batch_error.backward()
    ewc_error.backward()  # this call raises the RuntimeError below
    train_error += batch_error.item() + ewc_error.item()
    ewc_t += ewc_error.item()
    self.opt.step()

Initially I had `final_error = batch_error + ewc_error`, but the `ewc_error` term was not decreasing at all, which made it seem likely that it was not influencing the gradient descent. When I instead call `ewc_error.backward()` explicitly, I get the error:

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
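For what it is worth, the same error is reproducible with any tensor that has no graph history:

import torch

t = torch.zeros(1)  # requires_grad=False, no grad_fn
t.backward()        # RuntimeError: element 0 of tensors does not require grad ...

so somewhere along the way ewc_error apparently ends up as exactly this kind of graph-free tensor.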

I thought that if I use Variables and incorporate the model's parameters into my loss penalty, autograd should recognise the result as a tensor that requires grad? If that is incorrect, any suggestions on how to implement a custom penalty would be greatly appreciated!
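In case it helps narrow things down, here is a quick check of that assumption (just a sketch with a placeholder model). It behaves the way I expected right up until the result is re-wrapped:

import torch
import torch.nn as nn
from torch.autograd import Variable

m = nn.Linear(2, 2)  # placeholder model
pen = sum((p ** 2).sum() for p in m.parameters())
print(pen.requires_grad, pen.grad_fn)  # True, with a grad_fn: the history is intact

rewrapped = Variable(pen)  # re-wrap the computed result in a fresh Variable
print(rewrapped.grad_fn)   # None; as far as I can tell, the wrap drops the history

If that reading is right, both the final Variable(...) wrap in get_ewc_loss and the zero tensor returned from the bare except: would hand backward() something with no grad_fn, but I may be misunderstanding the Variable semantics.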


Did you figure this out?