Why the V_out.grad and the gradients of all mlps' weights and bias are None?

I don’t understand why loss.grad is None and all the weights and bias in the mlps are None. If the gradients are all None, I think the output loss should not decrease, but it is decreasing and then converged within about 20 epoch. Is there anything wrong? Hope my problem explaination is enough for understanding. Thanks for any help.
The codes for the network:

class ReproduceGNN(torch.nn.Module):

    def __init__(self, in_channels_bs, in_channels_ue, in_channels_h, out_channels_bs, out_channels_ue, out_channels_h,
                 d, N, num_layers):

        super(ReproduceGNN, self).__init__()

        #initialize the first layer
        self.layer1 = PreprocessingLayer(d)

        # initialize the `num_layers` ReproduceLayer layers
        self.middle_layers = []
        for i in range(num_layers):
            layer = ReproduceLayer(in_channels_bs, in_channels_ue, in_channels_h, out_channels_bs, out_channels_ue,

        self.lastLayer = PostProcessingLayer(d, N)

        self.device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
        self.to(self.device)  # move the entire model to cuda

    def forward(self, P, Noise, H):
        #feed into the preprocessing layer
        F_bs, F_ue, E = self.layer1(P, Noise, H)
        if self.device.type == "cuda":
            F_bs = F_bs.to(self.device)
            F_ue = F_ue.to(self.device)
            E = E.to(self.device)

        #feed into the updating layer
        for layer in self.middle_layers:
            F_bs, F_ue, E = layer(F_bs, F_ue, E)
            if self.device.type == "cuda":
                F_bs = F_bs.to(self.device)
                F_ue = F_ue.to(self.device)
                E = E.to(self.device)

        V_out = self.lastLayer(E, P)
        if self.device.type == "cuda":
            V_out = V_out.to(self.device)
        return V_out

The code for the costomized Loss function and train function:

def Loss(V_out, H, Noise):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    M, K, N = V_out.size()
    R = torch.zeros(K)
    temp = torch.zeros(K)
    interference = torch.zeros(K)
    H = H.to(torch.complex64)
    if device.type == "cuda":
        Noise = Noise.to(device)
    for k in range(K): 
        Signal = 0.0        
        for l in range(K):
            interference_k = 0.0
            if l != k:
                for m in range(M):
                    interference_k = interference_k + torch.matmul(torch.conj(H[m, k, :]).t(), V_out[m,l,:]) 
                interference[k] = interference[k] + abs(interference_k)**2       
        for m in range(M):
            Signal = Signal + torch.matmul(torch.conj(H[m, k, :]).t(), V_out[m,k,:])          
        temp[k] = abs(Signal) ** 2 / (interference[k] + Noise[k])  #SINR

        R[k] = torch.log2(1+temp[k])

    Rsum = -torch.sum(R)

    return Rsum

def train(P, Noise, dataset, num_epochs, lr):
    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    model = ReproduceGNN(in_channels_bs=64, in_channels_ue=64, in_channels_h=64, out_channels_bs=64, out_channels_ue=64,
                         out_channels_h=64, d=64, N=2, num_layers=2)

    optimizer = torch.optim.RMSprop(model.parameters(), lr)

    train_size = int(1 * len(dataset))
    print(f'train size: {train_size}')
    test_size = len(dataset) - train_size
    train_dataset, test_dataset = torch.utils.data.random_split(dataset, [train_size, test_size])

    batch_size = 64
    num_batches = (train_size + batch_size - 1) // (batch_size*num_epochs)
    print(f'num_batches: {num_batches}')  

    # define lists to save loss and rate for each epoch
    losses = []
    rate = []
    for epoch in range(num_epochs):
        running_loss = 0.0
        for batch_idx in range(num_batches):  # number of minibatches = num_batches
            start_idx = (epoch * num_batches + batch_idx) * batch_size
            end_idx = start_idx + batch_size
            subset_indices = range(start_idx, end_idx)
            subset = Subset(train_dataset, subset_indices)
            subset_loader = DataLoader(subset, batch_size=batch_size, shuffle=True)
            for data in subset_loader:
                channel = data
                if device.type == "cuda":
                    channel = channel.to(device)
                loss = 0.0
                batch_n, M, K, N = channel.size()
                for b in range(batch_n):
                    input_H = channel[b, :, :, :]
                    if device.type == "cuda":
                        input_H = input_H.to(device)
                    V_out = model(P, Noise, input_H)
                    loss = loss + Loss(V_out, input_H, Noise)
                #print(f'batch loss: {loss}')

                loss /= batch_n

                if device.type == "cuda":
                    loss = loss.cpu()
                running_loss += loss.item()

                for name, param in model.named_parameters():
                    if param.grad is None:
                        print(f'Parameter: {name}, Gradient: {param.grad}')

        epoch_loss = running_loss/num_batches

        print(f'Epoch: {epoch + 1:03d}, Training Loss: {epoch_loss:.4f}')

Printing the .grad attribute before the first backward call will show a None gradient as nothing was computed yet.
Also, you are ignoring the warning:

UserWarning: The .grad attribute of a Tensor that is not a leaf Tensor is being accessed. Its .grad attribute won't be populated during autograd.backward(). If you indeed want the .grad field to be populated for a non-leaf Tensor, use .retain_grad() on the non-leaf Tensor. If you access the non-leaf Tensor by mistake, make sure you access the leaf Tensor instead. See github.com/pytorch/pytorch/pull/30531 for more informations.

since you are trying to access the .grad attribute of non-leaf tensors.

Thanks for your reply, I just want to make sure that the learning is effective so I checked V_out.grad. I checked my code it seems like the weights and bias of all mlps in the ReproduceLayer are updated. Does this mean my network can learn effectively? I’m trying to reproduce the GNN in the paper: “Wang Y, Li Y, Shi Q, et al. Learning Cooperative Beamforming with Edge-Update Empowered Graph Neural Networks[J]. arXiv preprint arXiv:2212.08020, 2022.”, but my output is around 9.8 while theirs is 16.

That’s a good sign and indicates your model is properly learning.
To see the .grad attributes of non-leaf tensors refer to my previous post.