def gradient_calc(self, outputs, inputs, create_graph=False):
gradient = grad(outputs=outputs, inputs=inputs, grad_outputs=torch.ones(outputs.size()).cuda(self.args.device), retain_graph=True, create_graph=create_graph, only_inputs=True)[0]
gradient = gradient.view(gradient.size(0), -1)
grad_norm = (gradient.norm(2, dim=1)**2)
grad_norm_mean = grad_norm.mean()
return gradient, grad_norm_mean
def grad_annealing(self, outputs, inputs, anneal_factor, std_factor):
    """Gradient-norm penalty term, scaled by the annealing schedule.

    Args:
        outputs: tensor produced from ``inputs``.
        inputs: tensor the gradient is taken with respect to.
        anneal_factor: schedule weight applied to the penalty.
        std_factor: additional scaling applied to the penalty.

    Returns:
        (gradient, penalty): the flattened gradient and the scalar
        ``0.5 * anneal_factor * mean(||g||^2) * std_factor``.
    """
    # create_graph=True: the squared gradient norm is used as a loss term, so
    # the derivative graph must be retained for backprop through the penalty
    # (with False the penalty would contribute zero gradients to the params).
    gradient, grad_norm_2 = self.gradient_calc(outputs, inputs, True)
    penalty = 0.5 * anneal_factor * grad_norm_2 * std_factor
    return gradient, penalty
def approx_constraint(self, std, gradient, std_factor, noise_scale=3.5):
    """Monte-Carlo estimate of E[|g_i . r_i|] with Gaussian probe vectors.

    Draws one probe ``r_i ~ N(0, (noise_scale * std * std_factor)^2 I)`` per
    sample and returns the batch mean of the absolute dot product with the
    corresponding gradient row.

    Args:
        std: noise standard deviation (scalar).
        gradient: ``(batch, dim)`` tensor of flattened per-sample gradients.
        std_factor: extra multiplier on the noise standard deviation.
        noise_scale: constant multiplier (default 3.5, the previously
            hard-coded value — kept as the default for backward compatibility).

    Returns:
        Scalar tensor: ``mean_i |g_i . r_i|``.
    """
    # randn_like matches gradient's (batch, dim) shape, dtype and device,
    # replacing the hard-coded (batch_size, 3072) numpy draw + host->device copy.
    rv = noise_scale * (std * std_factor) * torch.randn_like(gradient)
    # Per-sample dot product g_i . r_i via batched matmul: (B,1,D) @ (B,D,1).
    dots = torch.bmm(gradient.unsqueeze(1), rv.unsqueeze(-1)).squeeze()
    const_term = dots.abs().mean()
    return const_term
Hi everyone. I'm using the gradient norm as a second loss term in my research.
However, I'm still somewhat confused about the `create_graph` argument of `autograd.grad`.
Is it correct to use the gradient norm as a loss function with `create_graph=False`?
Or should I set `create_graph=True` so that the norm of the gradient is itself differentiable and can serve as a loss term?