No inf checks were recorded for this optimizer mixed precision error

Model:

def _make_grid(input):
    # Get the width and height of the output feature map
    _, width, height = input.size()
    # Determine the size of each grid cell
    grid_size = height // 16
    grid_cells = []
    grid_coordinates = []
    for i in range(16):
        for j in range(16):
            # Calculate the coordinates for the current grid cell
            x1 = i * grid_size
            y1 = j * grid_size
            x2 = x1 + grid_size
            y2 = y1 + grid_size
            # Extract the region corresponding to the grid cell
            grid_cell = input[:, x1:x2, y1:y2]
            grid_coordinates.append(torch.tensor([float(x1),float(y1),float(x2),float(y2)]))
            grid_cells.append(grid_cell)
    grid_cells = torch.stack(grid_cells)
    grid_coordinates = torch.stack(grid_coordinates)
    return grid_cells, grid_coordinates


class Grid(torch.nn.Module):
    def __init__(self):
        super(Grid, self).__init__()

    def forward(self, x):
        res = []
        for img in x:
            res.append(_make_grid(img))
        return res


class CustomCLIP(torch.nn.Module):
  def __init__(self, model, num_classes: int = 4, bias=False):
    super().__init__()
    self.model = model
    self.num_classes = num_classes
    self.encoder = self.model.visual.float()
    self.conv1 = self.encoder.conv1
    self.bn1 = self.encoder.bn1
    self.relu1 = self.encoder.relu1
    self.avgpool = self.encoder.avgpool
    self.fc = torch.nn.Linear(512, self.num_classes, bias=bias)
    self.relu = torch.nn.ReLU()


    self.text_encoder = self.encode_text

    self.grid = Grid()

    # add a bottleneck
    self.image_encoder = torch.nn.Sequential(
      self.conv1,
      self.bn1,
      self.relu1,
      self.avgpool,
      torch.nn.Flatten(),
      torch.nn.Linear(288, self.num_classes, bias=bias),
    )


  def encode_text(self, text):
      x = self.model.token_embedding(text)
      x = self.fc(x)
      x = x[:, :, :, :4]
      return x

  def forward(self, img, cap):
    grids = self.grid(img)
    aspect_ratio = 224 / 224
    target_width = 224 // 16
    target_height = int(target_width / aspect_ratio)
    similar = []
    for idx, (gd, _) in enumerate(grids):
      x_ = []
      for g in gd:
          x = F.interpolate(g.unsqueeze(0), size=(target_width, target_height), mode='bilinear', align_corners=False)
          x = self.image_encoder(x)
          x /= x.norm(dim=-1, keepdim=True)
          x_.append(x)
      #with torch.no_grad():
      y = self.text_encoder(cap[idx].unsqueeze(0))
      y /= y.norm(dim=-1, keepdim=True)


      im_ = torch.cat(x_)

      target_len = y.size(2) - im_.size(0)
      im_ = F.pad(im_, (0,0,0,target_len), value=0)


      similarity_scores = torch.nn.functional.cosine_similarity(im_, y, dim=-1)
      max_values, max_indices = torch.max(similarity_scores, dim=1)
      boxes = grids[idx][1]
      selected_box = boxes[max_indices.to(boxes.device)]

      similar.append(selected_box)

    out_bbox = torch.cat(similar)
    out_bbox = out_bbox[:, 0, :]

    return out_bbox

Training step:

def training_step(net, data_loader, optimizer, cost_function, device='cuda:0'):
    samples = 0.0
    cumulative_loss = 0.0
    cumulative_accuracy = 0.0
    batch_ious = []
    # set the network to training mode
    net = net.float()
    net.train()
    accumulation_steps = 10
    clip_value = 3.0
    scaler = GradScaler()
    # iterate over the training set
    for batch_idx, batch in enumerate(data_loader):

        #load data into GPU
        batch['image'] = batch['image'].to(device)
        batch['captions'] = batch['captions'].to(device)
        target_bbox = batch['bbox'].to(device)
        # forward pass
        with autocast():
            out_bbox = net(batch['image'], batch['captions'])
            out_bbox = out_bbox.to(device)
            # loss computation
            loss = cost_function(out_bbox, target_bbox)
            loss = loss/accumulation_steps
            loss.requires_grad=True
        #backward-pass    
        scaler.scale(loss).backward()

        if (batch_idx + 1) % accumulation_steps == 0:
            # parameters update
            # Gradient scaling and optimization step
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(net.parameters(), clip_value)
            scaler.step(optimizer) 
            scaler.update()
            # gradients reset
            optimizer.zero_grad()
            
            
        # fetch prediction and loss value
        samples += batch['bbox'].shape[0]

        cumulative_loss += loss.item()

        iou_accuracy = torchvision.ops.box_iou(target_bbox, out_bbox).detach().cpu().numpy().sum().item()
        batch_ious.append(iou_accuracy)
            
        torch.cuda.empty_cache()
            
    # Calculate the average IoU across all batches
    if batch_ious:
        # Calculate the average IoU across all batches
        cumulative_accuracy = np.mean(batch_ious)
    else:
        cumulative_accuracy = 0.0 # Set to zero if there are no elements


    return cumulative_loss/samples, cumulative_accuracy

This error cant seem to go away. What am I doing wrong?
P.S. Batch size=100 training data=4000

AssertionError                            Traceback (most recent call last)
Cell In[13], line 1
----> 1 main()

Cell In[12], line 56, in main(root, data_dir, batch_size, test_batch_size, num_classes, learning_rate, weight_decay, momentum, epochs)
     53 # for each epoch, train the network and then compute evaluation results
     54 for e in range(epochs):
---> 56     train_loss, train_accuracy = training_step(modified_model, train_loader, optimizer, cost_function)
     57     torch.cuda.empty_cache()
     58     val_loss, val_accuracy = test_step(modified_model, val_loader, cost_function)

Cell In[8], line 35, in training_step(net, data_loader, optimizer, cost_function, device)
     28 scaler.scale(loss).backward()
     30 if (batch_idx + 1) % accumulation_steps == 0:
     31     # parameters update
     32     # Gradient scaling and optimization step
     33     #scaler.unscale_(optimizer)
     34     #torch.nn.utils.clip_grad_norm_(net.parameters(), clip_value) #ommit this
---> 35     scaler.step(optimizer) 
     36     scaler.update()
     37     # gradients reset

File ~/miniconda3/lib/python3.10/site-packages/torch/cuda/amp/grad_scaler.py:372, in GradScaler.step(self, optimizer, *args, **kwargs)
    369 if optimizer_state["stage"] is OptState.READY:
    370     self.unscale_(optimizer)
--> 372 assert len(optimizer_state["found_inf_per_device"]) > 0, "No inf checks were recorded for this optimizer."
    374 retval = self._maybe_opt_step(optimizer, optimizer_state, *args, **kwargs)
    376 optimizer_state["stage"] = OptState.STEPPED

AssertionError: No inf checks were recorded for this optimizer.

Could you explain why loss.requires_grad=True is used?
Did you detach the loss and tried to work around the valid error pointing out no computation graph was created by setting required_grad=True on the detached tensor? If so, this would explain the error and you should fix the original issue of detaching loss (or any previous tensor) from the computation graph.

I put requires_grad=True to loss as I was having this error.

# [RuntimeError: element 0 of variables does not require grad and does not have a grad_fn]

And I couldnt find out where did any of the variables went detached of computation graph. Can you please explain when does this error occur in common?

Tensors are detached in a few different ways:

  • you can detach tensors explicitly by calling x = x.detach() on them
  • 3rd party libraries are not tracked by Autograd, so if you use e.g. numpy and transform the np.array back to a tensor, the tensor will be detached
  • some operations are not differentiable, such as torch.argmax and thus detach the output.

Hello. I can’t pass by this situation. There are some processing which needs numpy array. So I am keeping this requires_grad line. And, can you please help on the problem this post is on? Why infinite gradients are not being recorded by the scaler? and a possible solution? Thank you.

AMP is just reraising the underlying issue of no gradients being computed, so you would need to fix the detaching issue. If you need to use np.arrays you would need to write a custom autograd.Function as described here.

Hello. Thank you. Is there a documentation where I can find the operations that make the tensors detached ?

I’m not aware of a collection listing all PyTorch operations detaching tensors from a computation graph, but you could check if by printing the .grad_fn of the output tensor as described before.
However, based on your description you are using a 3rd party library, numpy in this case, which won’t be tracked by Autograd.