Hi,
I am trying to add a penalty to my loss function, but it seems that the penalty does not have a backward pass even though all elements of the penalty have been declared as a Variable.
The code that forms my penalty is as follows
def make_fisher_matrix(self, previous_dataset, previous_batch_size, prev_nums):
print("making_fisher")
prev_idxs = get_labels_indices(previous_dataset.train_labels, prev_nums)
loader = DataLoader(previous_dataset, batch_size=previous_batch_size, sampler=torch.utils.data.sampler.SubsetRandomSampler(prev_idxs))
likelihoods = []
self.eval()
#init matrices
self.fisher_matrix = {n:p.clone().zero_() for n,p in self.named_parameters()}
for k, (i_data,label) in enumerate(loader):
data = Variable(i_data)
label = Variable(label)
previous_prediction = self(data)
log_pp = F.log_softmax(previous_prediction,dim=1)#take a log and softmax
likelihood = F.nll_loss(log_pp, label)
likelihood.backward(retain_graph=True)
#print(likelihood_grad)
for n,p in self.named_parameters():
self.fisher_matrix[n] = (p.grad.clone() ** 2) / len(previous_dataset)
self.prev_parameters = {n:p.clone() for n,p in self.named_parameters()}
def get_ewc_loss(self,lamda, debug=False):
try:
losses = Variable(torch.zeros(1))
for n,p in self.named_parameters():
p.requires_grad = True
pp_fisher = Variable(self.fisher_matrix[n])
pp = Variable(self.prev_parameters[n])
loss = (pp_fisher*((p - pp)**2)).sum()
losses += loss
return (Variable((lamda/2)*(losses)))
except:
return (Variable(torch.zeros(1)))
I use my ‘get_ewc_loss’ function in the following manner:
for batch, (data, label) in enumerate(self.train_loader):
if self.use_gpu:
input_data, g_label = Variable(data.cuda()), Variable(label.cuda())
else:
input_data, g_label = Variable(data), Variable(label)
self.opt.zero_grad()
output_vector = self.model(input_data)
batch_error = self.criterion(output_vector, g_label)
ewc_error = self.model.get_ewc_loss(lamda = self.lamda, debug=False)
#final_error = batch_error + ewc_error
batch_error.backward()
ewc_error.backward()
#final_error = batch_error + ewc_error
#final_error.backward()
train_error += final_error.item()
ewc_t += ewc_error.item()
self.opt.step()
Now initially I had final_error = ewc_error + batch_error, but my ewc_error was not reducing at all, thus i thought it seemed likely that ewc_error was not impacting the gradient descent at all. But now when i explicitly use ewc_error.backward() i get the error:
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
I thought if i use Variables and incorporate the parameters of the model into my loss penalty, the autograd function should recognise this as a tensor that requires grad? If that is incorrect any suggestions on how to implement a custom penalty would be greatly appreciated!