Help debugging code: RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

Do_Thanh_Dat_LE · January 2, 2024, 8:52pm

I am trying to implement the Momentum Contrast (MoCo) from an article to train with unlabeled images in MNIST handwritten digits data. I use the class MoCo defined in the code below to train but I got the error:

RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn.

I checked and saw that the attribute requires_grad of the output of function forward() in the MoCo class is False, so the loss.backward() could not operate. However, I do not know how to fix this error ?

# MoCo builder
class MoCo(nn.Module):

  def __init__(self, base_encoder, dim=128, K=65536, m=0.999, T=0.07, mlp=False, device = device):
    """
    dim: feature dimension (default: 128)
    K: queue size; number of negative keys (default: 65536)
    m: moco momentum of updating key encoder (default: 0.999)
    T: softmax temperature (default: 0.07)
    """
    super(MoCo, self).__init__()

    self.K = K
    self.m = m
    self.T = T

    # create the encoders
    self.encoder_q = base_encoder
    self.encoder_k = base_encoder

    if mlp:
      dim_mlp = self.encoder_q.fc.weight.shape[1]
      self.encoder_q.fc = nn.Sequential(
        nn.Linear(dim_mlp, dim_mlp), nn.ReLU(), self.encoder_q.fc
        )
      self.encoder_k.fc = nn.Sequential(
          nn.Linear(dim_mlp, dim_mlp), nn.ReLU(), self.encoder_k.fc
          )

    for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()):
      param_k.data.copy_(param_q.data)  # initialize
      param_k.requires_grad = False  # not update by gradient

    # create the queue
    self.register_buffer("queue", torch.randn(dim, K))
    self.queue = nn.functional.normalize(self.queue, dim=0)
    self.register_buffer("queue_ptr", torch.zeros(1, dtype=torch.long))

  @torch.no_grad()
  def _momentum_update_key_encoder(self):
    """
    Momentum update of the key encoder
    """
    for param_q, param_k in zip(self.encoder_q.parameters(), self.encoder_k.parameters()):
      param_k.data = param_k.data * self.m + param_q.data * (1.0 - self.m)

  @torch.no_grad()
  def _dequeue_and_enqueue(self, keys):
    batch_size = keys.shape[0]

    ptr = int(self.queue_ptr)
    assert self.K % batch_size == 0  # for simplicity

    # replace the keys at ptr (dequeue and enqueue)
    self.queue[:, ptr : ptr + batch_size] = keys.T
    ptr = (ptr + batch_size) % self.K  # move pointer

    self.queue_ptr[0] = ptr

  def forward(self, im_q, im_k):
    """
    Input:
      im_q: a batch of query images
      im_k: a batch of key images
    Output:
      logits, targets
    """

    # compute query features
    q = self.encoder_q(im_q)  # queries: NxC
    q = nn.functional.normalize(q, dim=1)

    # compute key features
    with torch.no_grad():
      self._momentum_update_key_encoder()  # update the key encoder
      k = self.encoder_k(im_k)  # keys: NxC
      k = nn.functional.normalize(k, dim=1)


    # compute logits
    # Einstein sum is more intuitive
    # positive logits: Nx1
    l_pos = torch.einsum("nc,nc->n", [q, k]).unsqueeze(-1)
    # negative logits: NxK
    l_neg = torch.einsum("nc,ck->nk", [q, self.queue.clone().detach()])

    # logits: Nx(1+K)
    logits = torch.cat([l_pos, l_neg], dim=1)

    # apply temperature
    logits /= self.T



    # labels: positive key indicators
    labels = torch.zeros(logits.shape[0], dtype=torch.long).to(device)

    # dequeue and enqueue
    self._dequeue_and_enqueue(k)

    return logits, labels

ptrblck · January 3, 2024, 1:31am

Could you check the .grad_fn attribute of intermediate activation tensors in your forward to see which tensor was detached in which operation?

Artinm89 · December 20, 2024, 11:31am

im getting the same error and i don’t know why it doesn’t make any sense we cant just simply set the require_gradient to true it won’t be MoCo anymore

ptrblck · December 20, 2024, 2:03pm

Same advice as mentioned before: check which intermediate activation loses its .grad_fn to isolate which operation detaches the tensor from the computation graph.