Nonetype gradients with respect to parameters

I’ve implemented a network comprising two sections: the initial part is an MLP with a single output (u), taking [x, t] as input. The output from the first section serves as input to the subsequent part, which computes and provides the vector [du/dx, du/dt] as the ultimate output (representing the derivatives of the MLP network’s output with respect to x and t). However, it appears that the dependency of the output vector on certain parameters has been lost. How can I rectify this issue and restore the correct dependency?
P.S. The issue encountered is " p.grad() is Nonetype" for some parameters.
Below is the code:
import torch
import torch.nn as nn
from torch.nn import functional as F
from collections import OrderedDict
import numpy as np
from choose_optimizer import *
from functorch import jacrev
from functorch import make_functional
from functorch import vmap

CUDA support

if torch.cuda.is_available():
device = torch.device(‘cuda’)
else:
device = torch.device(‘cpu’)

def div(u):
“”“Accepts a function u:R^D → R^D.”“”
J = jacrev(u)
return lambda x: torch.trace(J(x))

the deep neural network

class DNN(torch.nn.Module):
def init(self, layers, activation, use_batch_norm=False, use_instance_norm=False):
super(DNN, self).init()

    # parameters
    self.depth = len(layers) - 1

    if activation == 'identity':
        self.activation = torch.nn.Identity
    elif activation == 'tanh':
        self.activation = torch.nn.Tanh
    elif activation == 'relu':
        self.activation = torch.nn.ReLU
    elif activation == 'gelu':
        self.activation = torch.nn.GELU
    elif activation == 'sin':
        self.activation = Sine
    self.use_batch_norm = use_batch_norm
    self.use_instance_norm = use_instance_norm

    layer_list = list()
    for i in range(self.depth - 1):
        layer_list.append(
            ('layer_%d' % i, torch.nn.Linear(layers[i], layers[i+1]))
        )

        if self.use_batch_norm:
            layer_list.append(('batchnorm_%d' % i, torch.nn.BatchNorm1d(num_features=layers[i+1])))
        if self.use_instance_norm:
            layer_list.append(('instancenorm_%d' % i, torch.nn.InstanceNorm1d(num_features=layers[i+1])))

        layer_list.append(('activation_%d' % i, self.activation()))

    layer_list.append(
        ('layer_%d' % (self.depth - 1), torch.nn.Linear(layers[-2], layers[-1]))
    )
    layerDict = OrderedDict(layer_list)

    # deploy layers
    self.layers = torch.nn.Sequential(layerDict)

def forward(self, x):
    u = self.layers(x)
    u_t = torch.autograd.grad(
        u, t,
        grad_outputs=torch.ones_like(u),
        retain_graph=True,
        create_graph=True
    )[0]

        u_x = torch.autograd.grad(
        u, x,
        grad_outputs=torch.ones_like(u),
        retain_graph=True,
        create_graph=True
    )[0]
    return u_x,u_t

class PhysicsInformedNN_pbc():
“”“PINNs (convection/diffusion/reaction) for periodic boundary conditions.”“”
def init(self, system, X_u_train, u_train, X_f_train, bc_lb, bc_ub, layers, G, nu, beta, rho, optimizer_name, lr,
net, L=1, activation=‘tanh’, loss_style=‘mean’):

    self.system = system

    self.x_u = torch.tensor(X_u_train[:, 0:1], requires_grad=True).float().to(device)
    self.t_u = torch.tensor(X_u_train[:, 1:2], requires_grad=True).float().to(device)
    self.x_f = torch.tensor(X_f_train[:, 0:1], requires_grad=True).float().to(device)
    self.t_f = torch.tensor(X_f_train[:, 1:2], requires_grad=True).float().to(device)
    self.x_bc_lb = torch.tensor(bc_lb[:, 0:1], requires_grad=True).float().to(device)
    self.t_bc_lb = torch.tensor(bc_lb[:, 1:2], requires_grad=True).float().to(device)
    self.x_bc_ub = torch.tensor(bc_ub[:, 0:1], requires_grad=True).float().to(device)
    self.t_bc_ub = torch.tensor(bc_ub[:, 1:2], requires_grad=True).float().to(device)
    self.net = net

    if self.net == 'DNN':
        self.dnn = DNN(layers, activation).to(device)
    else: # "pretrained" can be included in model path
        # the dnn is within the PINNs class
        self.dnn = torch.load(net).dnn

    self.u = torch.tensor(u_train, requires_grad=True).float().to(device)
    self.layers = layers
    self.nu = nu
    self.beta = beta
    self.rho = rho

    self.G = torch.tensor(G, requires_grad=True).float().to(device)
    self.G = self.G.reshape(-1, 1)

    self.L = L

    self.lr = lr
    self.optimizer_name = optimizer_name

    self.optimizer = choose_optimizer(self.optimizer_name, self.dnn.parameters(), self.lr)

    self.loss_style = loss_style

    self.iter = 0

def net_u(self, x, t):
    """The standard DNN that takes (x,t) --> u."""
    u = self.dnn(torch.cat([x, t], dim=1))
    return u

def net_f(self, x, t):
    """ Autograd for calculating the residual for different systems."""
    u = self.net_u(x, t)

    u_t = torch.autograd.grad(
        u, t,
        grad_outputs=torch.ones_like(u),
        retain_graph=True,
        create_graph=True
    )[0]
    u_x = torch.autograd.grad(
        u, x,
        grad_outputs=torch.ones_like(u),
        retain_graph=True,
        create_graph=True
    )[0]

    u_xx = torch.autograd.grad(
        u_x, x,
        grad_outputs=torch.ones_like(u_x),
        retain_graph=True,
        create_graph=True
        )[0]

    if 'convection' in self.system or 'diffusion' in self.system:
        f = u_t - self.nu*u_xx + self.beta*u_x - self.G
    elif 'rd' in self.system:
        f = u_t - self.nu*u_xx - self.rho*u + self.rho*u**2
    elif 'reaction' in self.system:
        f = u_t - self.rho*u + self.rho*u**2
    return f

def net_b_derivatives(self, u_lb, u_ub, x_bc_lb, x_bc_ub):
    """For taking BC derivatives."""

    u_lb_x = torch.autograd.grad(
        u_lb, x_bc_lb,
        grad_outputs=torch.ones_like(u_lb),
        retain_graph=True,
        create_graph=True
        )[0]

    u_ub_x = torch.autograd.grad(
        u_ub, x_bc_ub,
        grad_outputs=torch.ones_like(u_ub),
        retain_graph=True,
        create_graph=True
        )[0]

    return u_lb_x, u_ub_x

def loss_pinn(self, verbose=True):
    """ Loss function. """
    if torch.is_grad_enabled():
        self.optimizer.zero_grad()
    u_pred = self.net_u(self.x_u, self.t_u)
    u_pred_lb = self.net_u(self.x_bc_lb, self.t_bc_lb)
    u_pred_ub = self.net_u(self.x_bc_ub, self.t_bc_ub)
    if self.nu != 0:
        u_pred_lb_x, u_pred_ub_x = self.net_b_derivatives(u_pred_lb, u_pred_ub, self.x_bc_lb, self.x_bc_ub)
    #f_pred = self.net_f(self.x_f, self.t_f)

    if self.loss_style == 'mean':
        loss_u = torch.mean((self.u - u_pred) ** 2)
        loss_b = torch.mean((u_pred_lb - u_pred_ub) ** 2)
        if self.nu != 0:
            loss_b += torch.mean((u_pred_lb_x - u_pred_ub_x) ** 2)
        #loss_f = torch.mean(f_pred ** 2)
    elif self.loss_style == 'sum':
        loss_u = torch.sum((self.u - u_pred) ** 2)
        loss_b = torch.sum((u_pred_lb - u_pred_ub) ** 2)
        if self.nu != 0:
            loss_b += torch.sum((u_pred_lb_x - u_pred_ub_x) ** 2)
        #loss_f = torch.sum(f_pred ** 2)

    loss = loss_u + loss_b #+ self.L*loss_f

    if loss.requires_grad:
        loss.backward()

    grad_norm = 0
    for p in self.dnn.parameters():
        param_norm = p.grad.detach().data.norm(2)
        grad_norm += param_norm.item() ** 2
    grad_norm = grad_norm ** 0.5

    if verbose:
        if self.iter % 100 == 0:
            print(
                'epoch %d, gradient: %.5e, loss: %.5e, loss_u: %.5e, loss_b: %.5e, loss_f: %.5e' % 
                (self.iter, grad_norm, loss.item(), loss_u.item(), loss_b.item(), loss_f.item())
            )
        self.iter += 1

    return loss

def train(self):
    self.dnn.train()
    self.optimizer.step(self.loss_pinn)

def predict(self, X):
    x = torch.tensor(X[:, 0:1], requires_grad=True).float().to(device)
    t = torch.tensor(X[:, 1:2], requires_grad=True).float().to(device)

    self.dnn.eval()
    u = self.net_u(x, t)
    u = u.detach().cpu().numpy()

    return u

The .to() operation is differentiable and will thus create a non-leaf tensor. Specify the device and dtype during the parameter creation and check if it solves the issue.

I modified each parameter as shown below:
self.x_u = torch.tensor(X_u_train[:, 0:1], device=device, dtype=dtype, requires_grad=True)

I made changes, but unfortunately, I encountered the same error. I believe this issue arises because the network calculates the gradient to determine the output([du/dx, du/dt]) of the network, causing the connection between the output and the parameters of the MLP network to disappear. Could you please provide guidance on how to maintain this connection?