Gradient norm as a loss function

import numpy as np
import torch
from torch.autograd import grad

def gradient_calc(self, outputs, inputs, create_graph=False):
    # Gradient of outputs w.r.t. inputs, plus the mean squared L2 norm per sample.
    gradient = grad(outputs=outputs, inputs=inputs,
                    grad_outputs=torch.ones(outputs.size()).cuda(self.args.device),
                    retain_graph=True, create_graph=create_graph, only_inputs=True)[0]
    gradient = gradient.view(gradient.size(0), -1)  # flatten to (batch, features)
    grad_norm = gradient.norm(2, dim=1) ** 2        # squared L2 norm per sample
    grad_norm_mean = grad_norm.mean()

    return gradient, grad_norm_mean

def grad_annealing(self, outputs, inputs, anneal_factor, std_factor):
    # Gradient-norm penalty term: 0.5 * anneal_factor * std_factor * E[||grad||^2].
    gradient, grad_norm_2 = self.gradient_calc(outputs, inputs, False)
    return gradient, 0.5 * anneal_factor * grad_norm_2 * std_factor

def approx_constraint(self, std, gradient, std_factor):
    # One Gaussian random vector per sample (3072 = flattened input dimension).
    rv = 3.5 * (std * std_factor) * np.random.randn(self.args.batch_size, 3072)
    rv = torch.from_numpy(rv).float()
    rv = rv.to(self.args.device)
    # Mean absolute per-sample dot product between the gradient and the random vector.
    const_term = torch.mean(torch.bmm(gradient.unsqueeze(1), rv.unsqueeze(-1)).squeeze().abs())
    return const_term
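
As a side note on approx_constraint, the bmm expression reduces to a per-sample dot product between the flattened gradient and the random vector. A tiny self-contained check (shapes here are arbitrary, chosen only for illustration):

import torch

gradient = torch.randn(4, 3072)   # per-sample flattened gradients
rv = torch.randn(4, 3072)         # per-sample random vectors

bmm_dot = torch.bmm(gradient.unsqueeze(1), rv.unsqueeze(-1)).squeeze()
plain_dot = (gradient * rv).sum(dim=1)

print(torch.allclose(bmm_dot, plain_dot))  # True: both are batch dot products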

Hi guys. I'm using the gradient norm as a second loss function in my research.
But I'm still somewhat confused about 'create_graph' in autograd.grad.
Is it correct to use the gradient norm as a loss function with 'create_graph=False'?
Or should I set 'create_graph=True' to backpropagate through the norm of the gradient?
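
For reference, here is a minimal standalone sketch (a toy linear model and random data, not the code above) illustrating the difference: with create_graph=True the input gradient stays in the autograd graph, so a loss built from its norm can be backpropagated into the model parameters; with create_graph=False the returned gradient is detached from the graph and such a backward pass fails.

import torch
import torch.nn as nn

torch.manual_seed(0)
model = nn.Linear(4, 1)                      # toy model
x = torch.randn(8, 4, requires_grad=True)    # toy batch
out = model(x)

# create_graph=True records the gradient computation itself in the graph,
# so the gradient-norm loss below can reach the model parameters.
g = torch.autograd.grad(outputs=out, inputs=x,
                        grad_outputs=torch.ones_like(out),
                        create_graph=True)[0]

grad_norm_loss = (g.view(g.size(0), -1).norm(2, dim=1) ** 2).mean()
grad_norm_loss.backward()
print(model.weight.grad)   # populated: the penalty influences the parameters

# With create_graph=False, g would not require grad, and
# grad_norm_loss.backward() would raise an error because nothing in the
# loss would be connected to the parameters.

This is the same pattern that gradient-penalty losses such as WGAN-GP rely on.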