loss.backward() does not update the model

Hi,
Here is my code. During training, the logits don't seem to change at all. What could be the reason?
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from dgl.nn import GraphConv, SAGEConv

class GCN(nn.Module):
    def __init__(self, in_feats, h_feats, num_classes):
        super(GCN, self).__init__()
        #self.conv1 = GraphConv(in_feats, h_feats, allow_zero_in_degree=True)
        #self.conv2 = GraphConv(h_feats, num_classes, allow_zero_in_degree=True)
        '''
        self.conv1 = SAGEConv(in_feats, 1024, 'pool')
        self.conv2 = SAGEConv(1024, 512, 'pool')
        self.conv3 = SAGEConv(512, 256, 'pool')
        self.conv4 = SAGEConv(256, num_classes, 'pool')
        '''
        self.conv1 = SAGEConv(in_feats, 64, 'pool')
        self.conv2 = SAGEConv(64, 64, 'pool')
        self.conv3 = SAGEConv(64, 32, 'pool')
        self.conv4 = SAGEConv(32, num_classes, 'pool')

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        h = F.relu(h)
        h = self.conv3(g, h)
        h = F.relu(h)
        h = self.conv4(g, h)
        h = torch.sigmoid(h)  # F.sigmoid is deprecated
        return h

# Create the model with given dimensions

#model = GCN(g.ndata['feat'].shape[1], 16, dataset.num_classes)

def train(g, model):
    for param in model.parameters():
        param.requires_grad = True
    loss_fn = nn.MSELoss()
    #optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
    optimizer = torch.optim.Adam(
        [{"params": model.parameters()}, {"params": loss_fn.parameters()}], lr=0.001
    )
    best_val_acc = 0
    best_test_acc = 0
    model = model.double()
    features = g.ndata['feature']
    labels = g.ndata['label']
    train_mask = g.ndata['train_mask']
    val_mask = g.ndata['val_mask']
    test_mask = g.ndata['test_mask']

    for e in range(100000):
        for param in model.parameters():
            param.requires_grad = True
        # Forward
        #optimizer.zero_grad()
        logits = model(g, features)

        # Compute prediction
        #pred = logits.argmax(1)

        # Compute loss
        # Note that you should only compute the losses of the nodes in the training set.

        #loss = F.binary_cross_entropy(logits[train_mask].double(), labels[train_mask].double())
        loss = F.mse_loss(logits[train_mask].double(), labels[train_mask].double())
        #loss_fn = nn.MSELoss()
        #logits = Variable(logits[train_mask].float(), requires_grad = True)
        #labels = Variable(labels[train_mask].float(), requires_grad = True)
        logits.requires_grad = True
        labels.requires_grad = True
        #logits.is_leaf = True
        #labels.is_leaf = True
        #loss = loss_fn(logits, labels)
        loss = Variable(loss, requires_grad = True)

        #loss = ((logits[train_mask] - labels[train_mask])**2).mean()

        #loss.is_leaf = True
        '''
        # Compute accuracy on training/validation/test
        #print(pred[train_mask].shape, labels[train_mask].shape)
        train_acc = (pred[train_mask] == labels[train_mask]).float().mean()
        val_acc = (pred[val_mask] == labels[val_mask]).float().mean()
        test_acc = (pred[test_mask] == labels[test_mask]).float().mean()
        '''
        # Compute accuracy on training/validation/test
        #print(pred[train_mask].shape, labels[train_mask].shape)
        train_acc = (logits[train_mask].argmax(1) == labels[train_mask].argmax(1)).float().mean()
        val_acc = (logits[val_mask].argmax(1) == labels[val_mask].argmax(1)).float().mean()
        test_acc = (logits[test_mask].argmax(1) == labels[test_mask].argmax(1)).float().mean()
        print("logits", logits[0])
        print("label", labels[0])

        # Save the best validation accuracy and the corresponding test accuracy.
        if best_val_acc < val_acc:
            best_val_acc = val_acc
            best_test_acc = test_acc

        # Backward
        optimizer.zero_grad()
        loss.requres_grad = True
        print(loss.is_leaf)
        loss.backward()
        optimizer.step()

        if e % 10 == 0:
            print('In epoch {}, loss: {:.3f}, val acc: {:.3f} (best {:.3f}), test acc: {:.3f} (best {:.3f})'.format(
                e, loss, val_acc, best_val_acc, test_acc, best_test_acc))

The output looks like this (note that the logits are identical at epoch 0 and at epoch 1760):

In epoch 0, loss: 0.248, val acc: 0.044 (best 0.044), test acc: 0.044 (best 0.044)
logits tensor([0.3738, 0.4152, 0.6884, 0.6958, 0.7550, 0.4365, 0.2546, 0.3509, 0.1485,
0.5741, 0.8709, 0.8646, 0.4125, 0.4890, 0.1682, 0.6156, 0.0968, 0.1036,
0.2117, 0.4285, 0.8276], dtype=torch.float64, requires_grad=True)
label tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0.], dtype=torch.float64, requires_grad=True)



In epoch 1760, loss: 0.248, val acc: 0.044 (best 0.044), test acc: 0.044 (best 0.044)
logits tensor([0.3738, 0.4152, 0.6884, 0.6958, 0.7550, 0.4365, 0.2546, 0.3509, 0.1485,
0.5741, 0.8709, 0.8646, 0.4125, 0.4890, 0.1682, 0.6156, 0.0968, 0.1036,
0.2117, 0.4285, 0.8276], dtype=torch.float64, requires_grad=True)
label tensor([1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
0., 0., 0.], dtype=torch.float64, requires_grad=True)
True

Hi Vilayannur!

I haven’t looked at your code in any detail, however:

The line

loss = Variable(loss, requires_grad = True)

“breaks the computation graph” because
Variable(loss, requires_grad = True) creates a new tensor
(that also happens to be called loss), and (by design) gradients
backpropagated to the new loss don’t get further backpropagated
through the old loss. (And creating the new loss with
requires_grad = True doesn’t fix this.)
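
Here is a tiny standalone illustration of this (the numbers are made up
just for the example): rewrapping a tensor produces a new leaf that is cut
off from the graph that computed it, so backward() never reaches the
parameters upstream of the original tensor:

import torch

w = torch.tensor([2.0], requires_grad=True)
loss = (3.0 * w).sum()   # loss is connected to w through the graph

# rewrapping makes a new leaf tensor detached from that graph
# (Variable(loss, requires_grad = True) behaves the same way)
new_loss = loss.detach().clone().requires_grad_(True)
new_loss.backward()
print(w.grad)            # None -- nothing backpropagated to w

loss.backward()          # backward through the original loss
print(w.grad)            # tensor([3.]): gradient reaches w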

(As an aside, Variable has been deprecated for some time now.
You should just use regular pytorch tensors.)

This suggests that you’ve been trying to fix things by turning various
tensor properties on and off. It might make sense to review some of
the basics of pytorch, in particular torch.autograd.
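
For reference, here is a minimal sketch of the training step without any
of the rewrapping, assuming the model, masks, and data from your post and
simply letting autograd track the loss on its own:

optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

for e in range(100):
    logits = model(g, features)   # autograd tracks this automatically
    loss = F.mse_loss(logits[train_mask], labels[train_mask].double())

    optimizer.zero_grad()
    loss.backward()               # gradients now reach the model parameters
    optimizer.step()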

Best.

K. Frank