NLL backward loss error

Why am I getting the error "Expected 2 or more dimensions (got 1)" while running the code below?

import torch
import torch.nn as nn
import math
import numpy as np
import time 
import torch.nn.functional as F
import torch.optim as optim

class GraphConvolution(nn.Module):

    def __init__(self, in_features, out_features, bias=True):
        super(GraphConvolution, self).__init__()

        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.parameter.Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = nn.parameter.Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, x, adj):
        support = torch.mm(x, self.weight.long())
        output = torch.spmm(adj, support)
        return output

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + str(self.in_features) + ' -> ' \
               + str(self.out_features) + ')'

class GCN(nn.Module):
    def __init__(self, nfeat, nhid, nclass):
        super(GCN, self).__init__()

        self.gc1 = GraphConvolution(nfeat, nhid)
        self.gc2 = GraphConvolution(nhid, nclass)
        self.dp = nn.Dropout(p=0.5)

    def forward(self, x, adj):
        x = F.relu(self.gc1(x, adj))
        x = self.dp(x.float()).long()
        x = self.gc2(x, adj).float()
        return torch.sum(F.log_softmax(x, dim=1), dim=1)

def train(epoch):
    t = time.time()
    model.train()
    optimizer.zero_grad()
    output = model(features, adj)
    
    loss_train = F.nll_loss(output[idx_train], labels[idx_train]) 
    loss_train.backward()
    optimizer.step()

    loss_val = F.nll_loss(output[idx_val], labels[idx_val])
    print('Epoch: {:04d}'.format(epoch+1),
          'loss_train: {:.4f}'.format(loss_train.item()),
          'loss_val: {:.4f}'.format(loss_val.item()),
          'time: {:.4f}s'.format(time.time() - t))
    

adj = torch.randint(high=10, size=(100,100)).long()
features = torch.sum(adj, dim=1).view(1,-1).long()
features = torch.transpose(features,0,1)

labels = torch.randint(high=10, size=(100,)).long()
print("labels = ",labels.size())

idx_train = torch.LongTensor(range(20))

model = GCN(nfeat=features.shape[1],
            nhid=9,
            nclass=8)

optimizer = optim.Adam(model.parameters(), lr=0.1, weight_decay= 5e-4)

for epoch in range(10):
    train(epoch)

Code: https://github.com/tkipf/pygcn/tree/master/pygcn

This code is taken from the Graph Convolutional Network paper. One more doubt I have about it: how are the weights trained when neither the adj nor the features matrix has requires_grad set on it?

Thanks in advance.

For multi-class classification, nn.NLLLoss expects the model output to have the shape [batch_size, nb_classes] and the target to have the shape [batch_size], containing class indices in the range [0, nb_classes-1].
Currently, the model output and the target both have the shape [batch_size], which yields this error.

I'm not familiar with the model, but the final torch.sum(..., dim=1) call in GCN.forward reduces the class dimension, which is likely what causes this issue.
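To illustrate with a small self-contained sketch (the shapes are just placeholders matching your setup of 8 classes):

import torch
import torch.nn.functional as F

batch_size, nb_classes = 20, 8
log_probs = F.log_softmax(torch.randn(batch_size, nb_classes), dim=1)  # [batch_size, nb_classes]
target = torch.randint(0, nb_classes, (batch_size,))                   # [batch_size] class indices
loss = F.nll_loss(log_probs, target)                                   # works

# F.nll_loss(log_probs.sum(dim=1), target)  # summing away dim=1 leaves a 1D tensor and raises
#                                           # "Expected 2 or more dimensions (got 1)"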

Currently, the weight and bias parameters of gc1 and gc2 will be trained, as they are registered as nn.Parameters inside the modules.
Usually you don’t need to get gradients for the inputs, so you wouldn’t need to set requires_grad=True for the features and adj tensors.
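As a quick sketch, reusing the model, features, and adj from your script, you can verify this directly:

for name, param in model.named_parameters():
    print(name, param.requires_grad)               # the gc1/gc2 weights and biases print True
print(features.requires_grad, adj.requires_grad)   # False is fine for plain input tensors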

What if I modify the model and, instead of a classification task, use it for a value prediction task, and hence use nn.MSELoss instead of NLLLoss to minimise the loss?

class GraphConvolution(nn.Module):

    def __init__(self, in_features, out_features, bias=True):
        super(GraphConvolution, self).__init__()

        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.parameter.Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = nn.parameter.Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, x, adj):
        support = torch.mm(x, self.weight.long())
        output = torch.spmm(adj, support)
        return output

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + str(self.in_features) + ' -> ' \
               + str(self.out_features) + ')'

class GCN(nn.Module):
    def __init__(self, nfeat, nhid, nclass):
        super(GCN, self).__init__()

        self.gc1 = GraphConvolution(nfeat, nhid)
        self.gc2 = GraphConvolution(nhid, nclass)
        self.dp = nn.Dropout(p=0.5)

    def forward(self, x, adj):
        x = F.relu(self.gc1(x, adj))
        x = self.dp(x.float()).long()
        x = self.gc2(x, adj).float()
        return torch.sum(F.log_softmax(x, dim=1), dim=1)

def train(epoch):
    t = time.time()
    model.train()
    optimizer.zero_grad()
    output = model(features, adj)
    print("Output = ",output.size())
    loss_train = criteria(output, adj_comp) 
    loss_train.backward()
    optimizer.step()

    
    print('Epoch: {:04d}'.format(epoch+1),
          'loss_train: {:.4f}'.format(loss_train.item()),
          'time: {:.4f}s'.format(time.time() - t))
    

adj = torch.randint(high=10, size=(100,100)).long()
features = torch.sum(adj, dim=1).view(1,-1).long()
features = torch.transpose(features,0,1)

adj_comp= torch.randint(high=10, size=(100,)).long()
print("adj_comp = ",adj_comp.size())

model = GCN(nfeat=features.shape[1],
            nhid=9,
            nclass=8)

criteria = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.1, weight_decay= 5e-4)

for epoch in range(10):
    train(epoch)

This still doesn't work, and I get the error "element 0 of tensors does not require grad and does not have a grad_fn".

So, basically, what loss function should I use here, and what changes are needed so that the model produces a value prediction and I can minimise the difference between the adj_comp tensor and the output tensor?

Thanks for the help.

nn.MSELoss expects the output and target to have the same shape (otherwise it will broadcast them and raise a warning).
However, you are currently breaking the computation graph by casting some outputs to LongTensors, which won't get gradients.
If you remove these .long() calls, your code will work.
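Here is a minimal sketch (with toy tensors, not your GCN) of how an integer cast drops the autograd history and leads to exactly the error you are seeing:

import torch

w = torch.randn(3, 3, requires_grad=True)
x = torch.randn(3, 3)

y = (x @ w).sum()
print(y.grad_fn)                  # SumBackward0: gradients can flow back to w

z = (x @ w).long().float().sum()  # the .long() cast detaches the result from the graph
print(z.grad_fn)                  # None
# z.backward()  # raises "element 0 of tensors does not require grad and does not have a grad_fn"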
What's your use case for using LongTensors, and how should the gradients be calculated in that case?

After removing all the long tensors,

class GraphConvolution(nn.Module):

    def __init__(self, in_features, out_features, bias=True):
        super(GraphConvolution, self).__init__()

        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.parameter.Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = nn.parameter.Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, x, adj):
        support = torch.mm(x, self.weight.long())
        output = torch.spmm(adj, support)
        return output

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + str(self.in_features) + ' -> ' \
               + str(self.out_features) + ')'

class GCN(nn.Module):
    def __init__(self, nfeat, nhid, nclass):
        super(GCN, self).__init__()

        self.gc1 = GraphConvolution(nfeat, nhid)
        self.gc2 = GraphConvolution(nhid, nclass)
        self.dp = nn.Dropout(p=0.5)

    def forward(self, x, adj):
        x = F.relu(self.gc1(x, adj))
        x = self.dp(x.float())
        x = self.gc2(x, adj)
        return torch.sum(F.log_softmax(x, dim=1), dim=1)

def train(epoch):
    t = time.time()
    model.train()
    optimizer.zero_grad()
    output = model(features, adj)
    print("Output = ",output.size())
    loss_train = criteria(output, adj_comp) 
    loss_train.backward()
    optimizer.step()

    
    print('Epoch: {:04d}'.format(epoch+1),
          'loss_train: {:.4f}'.format(loss_train.item()),
          'time: {:.4f}s'.format(time.time() - t))
    

adj = torch.randint(high=10, size=(100,100)).long()
features = torch.sum(adj, dim=1).view(1,-1).long()
features = torch.transpose(features,0,1)

adj_comp= torch.randint(high=10, size=(100,)).long()
print("adj_comp = ",adj_comp.size())

model = GCN(nfeat=features.shape[1],
            nhid=9,
            nclass=8)

criteria = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.1, weight_decay= 5e-4)

for epoch in range(10):
    train(epoch)

there is an error "Expected object of scalar type Float but got scalar type Long for argument #2 'mat2' in call to _th_mm" in the line x = self.gc2(x, adj).

This is why I had converted the tensors to long: it made the matrix multiplication feasible, but it broke the gradient computation.

How do I proceed without converting to long tensors, while preserving the gradients as well?

Your self.weight is still cast to long(), so you might need to remove that cast as well.

I'm not familiar with your use case, but LongTensors won't get any gradients, so you would have to stick to floating-point types. Depending on your use case, you might want to post-process the adj matrix if you only want ones and zeros in it.
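For example, a rough sketch of keeping everything in floating point (the binarisation at the end is just an assumption about your use case):

adj = torch.randint(high=10, size=(100, 100)).float()
features = torch.sum(adj, dim=1).view(-1, 1)            # already float, shape [100, 1]
adj_comp = torch.randint(high=10, size=(100,)).float()

# optional post-processing, if the adjacency matrix should only contain ones and zeros
adj_binary = (adj > 0).float()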

Thanks. Changing self.weight.long() to self.weight did the trick, together with creating the inputs as float tensors: adj = torch.randint(high=10, size=(100,100)).float(), features = torch.sum(adj, dim=1).view(1,-1).float(), and adj_comp = torch.randint(high=10, size=(100,)).float().

That should do it, right? Thanks a lot for your help and time.

I trained the model for hours and even longer, but the loss still doesn't seem to converge. Are the weights getting detached and the optimization breaking somewhere?

I've set the number of epochs to 10000 here, and I trained with a much larger number earlier, but apparently it doesn't make any difference.

n=1000

adj = torch.randint(high=10, size=(n,n)).float()
features = torch.sum(adj, dim=1).view(1,-1).float()
features = torch.transpose(features,0,1)
adj_comp= torch.randint(low=1,high=100, size=(n,)).float()

class GraphConvolution(nn.Module):

    def __init__(self, in_features, out_features, bias=True):
        super(GraphConvolution, self).__init__()

        self.in_features = in_features
        self.out_features = out_features
        self.weight = nn.parameter.Parameter(torch.FloatTensor(in_features, out_features))
        if bias:
            self.bias = nn.parameter.Parameter(torch.FloatTensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        stdv = 1. / math.sqrt(self.weight.size(1))
        self.weight.data.uniform_(-stdv, stdv)
        if self.bias is not None:
            self.bias.data.uniform_(-stdv, stdv)

    def forward(self, x, adj):
        support = torch.mm(x, self.weight)
        output = torch.spmm(adj, support)
        return output

    def __repr__(self):
        return self.__class__.__name__ + ' (' \
               + str(self.in_features) + ' -> ' \
               + str(self.out_features) + ')'

class GCN(nn.Module):
    def __init__(self, nin, nhid1, nhid2, nout):
        super(GCN, self).__init__()

        self.gc1 = GraphConvolution(nin, nhid1)
        self.gc2 = GraphConvolution(nhid1, nhid2)
        self.gc3 = GraphConvolution(nhid2, nout)
        self.dp = nn.Dropout(p=0.5)

    def forward(self, x, adj):
        x = F.relu(self.gc1(x, adj))
        x = F.relu(x)
        x = self.dp(x.float())
        x = self.gc2(x, adj)
        x = F.relu(x)
        x = self.dp(x.float())
        x = self.gc3(x, adj)
        x = F.relu(x)
        x = self.dp(x.float())
        return torch.sum(x, dim=1)

def train(epoch):
    t = time.time()
    model.train()
    optimizer.zero_grad()
    output = model(features, adj)

    loss_train = criteria(output, adj_comp) 
    loss_train.backward()
    optimizer.step()
    
    if not epoch%1000:
        print('Epoch: {:04d}'.format(epoch+1),
              'loss_train: {:.4f}'.format(loss_train.item()),
              'time: {:.4f}s'.format(time.time() - t))


model = GCN(nin = features.shape[1], nhid1 = 100, nhid2 = 50, nout= 10)

criteria = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.000001)

net_time = time.time()
for epoch in range(10000):
    train(epoch)
    if not epoch%1000:
        print("Net time = {:.4f}s \n".format(time.time() - net_time))

You could check if the computation graph is detached by checking all parameters for a valid gradient after calling loss.backward(), e.g. via:

for name, param in model.named_parameters():
    print(name, param.grad)

If some of them are None, you might be detaching the graph.

Otherwise, I would recommend trying to overfit a small dataset (e.g. just 10 samples) and making sure your model is able to overfit it by playing around with the hyperparameters, the model architecture, etc.

All of the values are coming out as None, so the entire graph is getting detached, although I don't understand why. I've written the same code; I just want to convert an adjacency matrix to a vector via torch.sum(x, dim=1) and compare it with another vector, adj_comp.

What's wrong with this code, and why are the weights detaching?

(The code is identical to the snippet in my previous post.)

I get valid gradients for the weight parameters using your code snippet (the bias gradients are None, since the bias parameter is never used in your forward method):

gc1.weight tensor([[ 0.0000e+00,  0.0000e+00,  5.7626e+12,  0.0000e+00,  2.0992e+12,
         -4.3610e+12,  0.0000e+00,  1.7043e+12,  0.0000e+00,  0.0000e+00,
          1.0510e+12,  5.1097e+11,  1.1546e+12,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00, -1.9490e+12, -5.2885e+11, -2.5631e+11,
          0.0000e+00,  1.9785e+12,  2.1556e+11,  3.4624e+11,  3.0468e+12,
         -1.4528e+12,  2.2929e+12,  2.6372e+12,  1.0251e+12,  0.0000e+00,
          0.0000e+00, -1.3199e+12, -1.1152e+11, -3.9257e+11,  1.8377e+12,
          0.0000e+00,  0.0000e+00, -5.2774e+11,  7.5787e+12,  6.4663e+11,
          9.5647e+12,  0.0000e+00, -2.4952e+12,  0.0000e+00,  1.3465e+11,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  8.5702e+11,  4.1229e+11,
          8.5888e+11,  0.0000e+00,  1.6762e+12,  0.0000e+00, -8.6959e+11,
          2.0165e+12,  0.0000e+00,  0.0000e+00,  0.0000e+00,  1.6999e+12,
          0.0000e+00,  9.5645e+11,  1.3904e+12, -2.0582e+12, -1.1083e+12,
          0.0000e+00,  2.0443e+12,  0.0000e+00,  0.0000e+00,  9.8510e+11,
         -1.0981e+12,  0.0000e+00,  0.0000e+00, -2.1311e+12,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  2.0415e+12, -3.1155e+11,
          0.0000e+00, -1.2381e+12, -1.4073e+11,  0.0000e+00,  0.0000e+00,
         -1.0034e+11,  3.8488e+12,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  4.3684e+11, -1.7028e+12,
         -2.5951e+12, -2.4032e+12,  0.0000e+00,  3.5148e+12, -3.6833e+12]])
gc1.bias None
gc2.weight tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 4.6452e+10,  0.0000e+00,  0.0000e+00,  ...,  8.7185e+11,
         -1.2338e+11,  2.8303e+11],
        ...,
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 2.2199e+10,  0.0000e+00,  0.0000e+00,  ...,  5.8348e+11,
         -7.5911e+10,  1.6448e+11],
        [ 9.0184e+10,  0.0000e+00,  0.0000e+00,  ...,  1.3702e+12,
         -1.8119e+11,  5.0495e+11]])
gc2.bias None
gc3.weight tensor([[1.3418e+09, 1.0558e+09, 3.0883e+09, 2.7453e+09, 8.3871e+08, 2.9497e+09,
         0.0000e+00, 0.0000e+00, 1.8982e+09, 2.8017e+09],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [5.4088e+11, 4.6983e+11, 7.4464e+11, 4.8750e+11, 8.1482e+10, 6.1323e+11,
         0.0000e+00, 0.0000e+00, 4.3579e+11, 4.8442e+11],
        [6.1316e+09, 4.8247e+09, 1.4112e+10, 1.2545e+10, 3.8326e+09, 1.3479e+10,
         0.0000e+00, 0.0000e+00, 8.6741e+09, 1.2803e+10],
        [1.5149e+10, 1.0378e+10, 1.8953e+10, 1.3017e+10, 2.6907e+09, 1.9303e+10,
         0.0000e+00, 0.0000e+00, 1.4705e+10, 1.6191e+10],
        [8.3224e+09, 7.1178e+09, 1.7066e+10, 1.2104e+10, 1.7453e+09, 1.2392e+10,
         0.0000e+00, 0.0000e+00, 7.6211e+09, 1.2210e+10],
        [5.1466e+11, 4.4102e+11, 6.7929e+11, 4.3283e+11, 5.6717e+10, 5.3794e+11,
         0.0000e+00, 0.0000e+00, 3.8280e+11, 4.3141e+11],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [2.7988e+10, 2.3244e+10, 3.1372e+10, 1.6246e+10, 1.4151e+09, 2.5873e+10,
         0.0000e+00, 0.0000e+00, 2.0801e+10, 1.7842e+10],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.2529e+11, 1.0351e+11, 2.0384e+11, 1.5329e+11, 3.9726e+10, 1.7360e+11,
         0.0000e+00, 0.0000e+00, 1.1277e+11, 1.6391e+11],
        [1.9331e+11, 1.7053e+11, 2.3887e+11, 1.4136e+11, 2.3857e+10, 1.8999e+11,
         0.0000e+00, 0.0000e+00, 1.3634e+11, 1.5548e+11],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [2.6648e+11, 2.1048e+11, 3.6879e+11, 2.1916e+11, 3.8415e+10, 3.3483e+11,
         0.0000e+00, 0.0000e+00, 2.6191e+11, 2.2762e+11],
        [4.8012e+11, 4.0491e+11, 6.6682e+11, 4.4386e+11, 6.6911e+10, 5.6540e+11,
         0.0000e+00, 0.0000e+00, 4.0385e+11, 4.2403e+11],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [9.6287e+10, 8.5866e+10, 1.2923e+11, 8.1884e+10, 1.1501e+10, 1.0275e+11,
         0.0000e+00, 0.0000e+00, 7.2406e+10, 7.9410e+10],
        [3.8502e+11, 3.2341e+11, 5.5472e+11, 3.8425e+11, 6.7005e+10, 4.7505e+11,
         0.0000e+00, 0.0000e+00, 3.3311e+11, 3.7214e+11],
        [1.2536e+11, 9.0180e+10, 1.4233e+11, 8.8746e+10, 1.3844e+10, 1.3854e+11,
         0.0000e+00, 0.0000e+00, 1.0551e+11, 1.0729e+11],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.6826e+11, 1.3820e+11, 2.1675e+11, 1.4432e+11, 1.7525e+10, 1.8229e+11,
         0.0000e+00, 0.0000e+00, 1.3090e+11, 1.4944e+11],
        [1.7524e+11, 1.4594e+11, 2.3257e+11, 1.6048e+11, 3.3265e+10, 1.9856e+11,
         0.0000e+00, 0.0000e+00, 1.3721e+11, 1.7323e+11],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.9308e+11, 2.0435e+11, 2.8336e+11, 1.8084e+11, 4.6458e+10, 1.8524e+11,
         0.0000e+00, 0.0000e+00, 1.1171e+11, 1.8113e+11],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [4.9807e+11, 4.4118e+11, 5.8472e+11, 3.5972e+11, 1.0318e+10, 4.5462e+11,
         0.0000e+00, 0.0000e+00, 3.2011e+11, 3.2889e+11],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [6.5498e+11, 5.1708e+11, 8.7470e+11, 5.8653e+11, 8.7341e+10, 7.9434e+11,
         0.0000e+00, 0.0000e+00, 5.8149e+11, 5.9509e+11],
        [7.4386e+11, 6.4328e+11, 1.0640e+12, 7.3333e+11, 1.3503e+11, 8.8885e+11,
         0.0000e+00, 0.0000e+00, 6.1150e+11, 7.2601e+11],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [4.2528e+10, 3.4561e+10, 6.0354e+10, 3.6555e+10, 1.0606e+10, 5.1470e+10,
         0.0000e+00, 0.0000e+00, 4.1024e+10, 4.1764e+10],
        [2.0269e+11, 1.5970e+11, 2.6420e+11, 1.6884e+11, 3.2350e+10, 2.3451e+11,
         0.0000e+00, 0.0000e+00, 1.7871e+11, 1.8206e+11],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [1.1283e+10, 1.0017e+10, 2.2469e+10, 1.8430e+10, 5.4394e+09, 1.9605e+10,
         0.0000e+00, 0.0000e+00, 1.2453e+10, 1.8734e+10],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00,
         0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
        [2.6133e+11, 2.2275e+11, 3.8679e+11, 2.5941e+11, 6.0402e+10, 3.2680e+11,
         0.0000e+00, 0.0000e+00, 2.3434e+11, 2.6632e+11],
        [3.3256e+11, 2.7949e+11, 5.0830e+11, 3.3410e+11, 7.3282e+10, 4.2524e+11,
         0.0000e+00, 0.0000e+00, 3.0468e+11, 3.5502e+11],
        [1.1428e+11, 1.0545e+11, 1.6758e+11, 1.1861e+11, 1.9412e+10, 1.3258e+11,
         0.0000e+00, 0.0000e+00, 8.3063e+10, 1.1005e+11],
        [2.5735e+11, 2.1977e+11, 3.7276e+11, 2.6437e+11, 3.3609e+10, 3.1184e+11,
         0.0000e+00, 0.0000e+00, 2.1148e+11, 2.4916e+11]])
gc3.bias None
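As a side note, here is a sketch based on the original pygcn forward method: if you add the bias inside GraphConvolution.forward, it participates in the computation graph and gets a gradient as well:

def forward(self, x, adj):
    support = torch.mm(x, self.weight)
    output = torch.spmm(adj, support)
    if self.bias is not None:
        return output + self.bias   # the bias is now used, so bias.grad will be populated
    return output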

Okay, I got the gradients you mentioned after rectifying the code, but I'm still unsure why the loss doesn't converge even after a large amount of training time. I'm only training on a small graph of size (1000 x 1000), which is technically not large at all, given that the model eventually needs to be tested and evaluated on graphs with millions of nodes!

I don't know why the model is not converging, as I'm not familiar with the code, so you might need to wait for others to take another look at it.
Since the graph is not detached, I would still try to overfit a very simple use case and play around with the hyperparameters.
E.g., could you decrease the graph to 10x10 and make sure your model can learn the targets, or doesn't that make sense for this type of model?
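A rough sketch of such an overfitting check, reusing the GCN class and imports from your snippet (the hyperparameter values are just guesses):

n = 10
adj_small = torch.randint(high=10, size=(n, n)).float()
features_small = torch.sum(adj_small, dim=1).view(-1, 1)
target_small = torch.randint(low=1, high=100, size=(n,)).float()

model = GCN(nin=features_small.shape[1], nhid1=16, nhid2=8, nout=4)
model.dp.p = 0.0                      # dropout adds noise, so disable it for this sanity check
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

model.train()
for step in range(2000):
    optimizer.zero_grad()
    out = model(features_small, adj_small)
    loss = criterion(out, target_small)
    loss.backward()
    optimizer.step()
    if step % 500 == 0:
        print(step, loss.item())

# the loss should approach zero on such a tiny problem; if it doesn't,
# revisit the learning rate, the input scaling, or the model architecture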