Grad is None for nn.Parameter

Hi @ptrblck,

I'm trying to understand this grad computation with another problem. This time the grad is all 0.

In the code below, I want to compute a convex combination of tensors of shape [n_p, len, dim] with a 1-D alpha tensor of shape [n_p], applying the weights by broadcasting.

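Just to make the operation concrete, here is a small standalone sketch (dummy shapes, not the real model): the einsum I use should be equivalent to a plain broadcast multiply followed by a sum over the n_p dimension.

import torch
from torch.nn import functional as F

n_p, length, dim = 5, 3, 4
emb = torch.randn(n_p, length, dim)
w = F.softmax(torch.rand(n_p), dim=0)                   # [n_p], sums to 1

out_einsum = torch.einsum('ijk,i->ijk', emb, w).sum(0)  # [length, dim]
out_broadcast = (w[:, None, None] * emb).sum(0)         # same result via broadcasting
print(torch.allclose(out_einsum, out_broadcast))        # True
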
After backward(), only the [n_p, len, dim] tensors get a gradient; the grad of the [n_p] alpha tensors is all 0.

I found this PyTorch forum discussion that is similar to my problem, but I still don't understand why the grad is all 0.

This code is similar to the MoCo GitHub code, which also uses torch.einsum to compute the logits that are passed to the loss.

import torch
import torch.nn as nn
from torch.nn import functional as F

def forward_mprompts(emb, emb_deep, alphas, alphas_deep):
    '''
    convex combination
    :param emb: [n_p, len, dim]
    :param emb_deep: [n_p, n_layers, len, dim]
    :param alphas: [n_p]
    :param alphas_deep: [n_p]
    :return: out_emb [1, len, dim], out_emb_deep [n_layers, len, dim]
    '''

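    # scale each of the n_p embeddings by its softmax weight and sum over the n_p dimension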
    out_emb = torch.einsum('ijk,i->ijk', emb, F.softmax(alphas)).sum(0).unsqueeze(0)

    out_emb_deep = torch.einsum('ijkf,i->ijkf', emb_deep, F.softmax(alphas_deep)).sum(0)

    return out_emb, out_emb_deep

def MSE_loss(x, y):
    return F.mse_loss(x, y)

n_p = 5
n_ctx = 3
dim = 4
n_layers = 2

emb = nn.Parameter(torch.zeros(n_p, n_ctx, dim))

alphas = nn.Parameter(torch.rand(n_p))

emb_deep = nn.Parameter(torch.zeros(n_p, n_layers - 1, n_ctx, dim))

alphas_deep = nn.Parameter(torch.rand(n_p))

out_emb, out_emb_deep = forward_mprompts(emb, emb_deep, alphas, alphas_deep)

y_emb = torch.rand(1, n_ctx, dim)
y_emb_deep = torch.rand(n_layers - 1, n_ctx, dim)

loss = MSE_loss(out_emb, y_emb)
loss_deep = MSE_loss(out_emb_deep, y_emb_deep)

print(alphas.is_leaf)   # True
alphas.retain_grad()

print(alphas_deep.is_leaf)  # True
alphas_deep.retain_grad()

loss.backward()
loss_deep.backward()

print(emb.grad)      # Tensor torch.Size([5, 3, 4])
print(emb_deep.grad)  # Tensor torch.Size([5, 1, 3, 4])
print(alphas.grad)    # tensor([0., 0., 0., 0., 0.])
print(alphas_deep.grad)   # tensor([0., 0., 0., 0., 0.])

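A quick extra check (not in the snippet above) also suggests nothing is detached from the graph:

print(alphas.requires_grad)         # True
print(out_emb.grad_fn is not None)  # True
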
In addition, the code above also gives the following warning:

UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.
  out_emb = torch.einsum('ijk,i->ijk', [emb, F.softmax(alphas)]).sum(0).unsqueeze(0)
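
I assume the fix for the warning is just passing dim explicitly, e.g. something like

out_emb = torch.einsum('ijk,i->ijk', emb, F.softmax(alphas, dim=0)).sum(0).unsqueeze(0)
out_emb_deep = torch.einsum('ijkf,i->ijkf', emb_deep, F.softmax(alphas_deep, dim=0)).sum(0)

but for a 1-D tensor the implicit choice is dim=0 anyway, so this should only silence the warning without changing the result.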

Thank you