RuntimeError: The size of tensor a (77) must match the size of tensor b (64) at non-singleton dimension 2

My model architecture:
import torch
import torch.nn.functional as F

# `model` (a loaded CLIP model, e.g. model, _ = clip.load("RN50")) and `device` are defined elsewhere.

def _make_grid_(input):
    # Get the height and width of the output feature map
    _, height, width = input.size()

    # Determine the size of each grid cell
    grid_size = height // 4
    grid_cells = []
    boxes = torch.zeros(4, 4, 4)
    for i in range(4):
        for j in range(4):
            x1 = i * grid_size
            y1 = j * grid_size
            x2 = x1 + grid_size
            y2 = y1 + grid_size
            boxes[i, j, :] = torch.tensor([float(x1), float(y1), float(x2), float(y2)])
            grid_cell = input[:, x1:x2, y1:y2]
            grid_cells.append(grid_cell)
    grid_cells = torch.stack(grid_cells)
    return grid_cells, boxes


def _make_grid(input):
    # Get the width and height of the output feature map
    _, width, height = input.size()
    # Determine the size of each grid cell
    grid_size = height // 4
    grid_cells = []
    grid_coordinates = []
    for i in range(4):
        for j in range(4):
            # Calculate the coordinates for the current grid cell
            x1 = i * grid_size
            y1 = j * grid_size
            x2 = x1 + grid_size
            y2 = y1 + grid_size
            # Extract the region corresponding to the grid cell
            grid_cell = input[:, x1:x2, y1:y2]
            grid_coordinates.append(torch.tensor([float(x1),float(y1),float(x2),float(y2)]))
            grid_cells.append(grid_cell)
    grid_cells = torch.stack(grid_cells)
    grid_coordinates = torch.stack(grid_coordinates)
    return grid_cells, grid_coordinates


class Grid(torch.nn.Module):
    def __init__(self):
        super(Grid, self).__init__()

    def forward(self, x):
        res = []
        for img in x:
            res.append(_make_grid(img))
        return res


class CustomCLIP(torch.nn.Module):
  def __init__(self, num_classes: int = 10, bias=False):
    super().__init__()

    self.encoder = model.visual.float()
    self.conv1 = self.encoder.conv1
    self.bn1 = self.encoder.bn1
    self.relu1 = self.encoder.relu1
    self.avgpool = self.encoder.avgpool
    self.attnpool = self.encoder.attnpool
    self.text_encoder = self.encode_text
    self.fc = torch.nn.Linear(512, 4, bias=bias)
    self.relu = torch.nn.ReLU()

    self.grid = Grid()

    # add a bottleneck
    self.image_encoder = torch.nn.Sequential(
      self.conv1,
      self.bn1,
      self.relu1,
      torch.nn.Dropout(0.1),
      self.avgpool,
      torch.nn.Flatten(),
      torch.nn.Linear(20000, 4, bias=bias),
      self.relu,
    )

  def encode_text(self, text):
      x = model.token_embedding(text)
      x = x + model.positional_embedding
      x = model.ln_final(x)
      x = self.fc(x)
      x = self.relu(x)

      return x

  def forward(self, img, cap):
    grids = self.grid(img)
    aspect_ratio = 224 / 224
    target_width = 100
    target_height = int(target_width / aspect_ratio)
    similar = []
    for idx, (gd, _) in enumerate(grids):
      x_ = []
      for g in gd:
          x = F.interpolate(g.unsqueeze(0), size=(target_width, target_height), mode='bilinear', align_corners=False)
          with torch.no_grad():
            x = self.image_encoder(x)
          x /= x.norm(dim=-1, keepdim=True)
          x_.append(x)

      with torch.no_grad():
        y = self.text_encoder(cap[idx].unsqueeze(0))
      y /= y.norm(dim=-1, keepdim=True)

      im_ = torch.cat(x_)

      similarity = F.softmax(im_ @ y.mT, dim=0)

      max_value, max_index = torch.max(similarity, dim=0)
      boxes = grids[idx][1].to(device)
      similar.append(boxes[max_index])
    out_bbox = torch.cat(similar)
    return out_bbox

Getting this error:

/usr/local/lib/python3.10/dist-packages/torch/nn/modules/loss.py:933: UserWarning: Using a target size (torch.Size([64, 4])) that is different to the input size (torch.Size([10304, 16, 77, 4])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
  return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-18-263240bbee7e> in <cell line: 1>()
----> 1 main()

6 frames
/usr/local/lib/python3.10/dist-packages/torch/functional.py in broadcast_tensors(*tensors)
     71     if has_torch_function(tensors):
     72         return handle_torch_function(broadcast_tensors, tensors, *tensors)
---> 73     return _VF.broadcast_tensors(tensors)  # type: ignore[attr-defined]
     74 
     75 

RuntimeError: The size of tensor a (77) must match the size of tensor b (64) at non-singleton dimension 2

This is a model for a visual grounding task. How can I change the shapes so that the output and target match?

You seem to have two functions with similar names, _make_grid_ and _make_grid.
You can delete _make_grid_, as it doesn’t seem to be used anywhere else.

About the error message: it looks like you need to modify _make_grid to output 64-sized patches instead of the 77-sized patches it is producing now.
Alternatively, you can pass an img into CustomCLIP’s forward that is already correctly sized.
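
For the first suggestion, here is a minimal sketch of what a fixed-size variant of _make_grid could look like. This is an illustration only, not your actual pipeline: the name _make_grid_fixed, the 4x4 grid, the 64x64 cell size, and the dummy 224x224 image are all assumptions made for the example.

import torch
import torch.nn.functional as F

def _make_grid_fixed(feature_map, cells=4, cell_size=64):
    # feature_map: a (C, H, W) image or feature tensor
    _, height, width = feature_map.size()
    cell_h, cell_w = height // cells, width // cells
    grid_cells, grid_coordinates = [], []
    for i in range(cells):
        for j in range(cells):
            y1, x1 = i * cell_h, j * cell_w      # top-left corner (row, col)
            y2, x2 = y1 + cell_h, x1 + cell_w    # bottom-right corner
            patch = feature_map[:, y1:y2, x1:x2]
            # Resize every cell to the same spatial size, independent of the input resolution
            patch = F.interpolate(patch.unsqueeze(0), size=(cell_size, cell_size),
                                  mode='bilinear', align_corners=False).squeeze(0)
            grid_cells.append(patch)
            grid_coordinates.append(torch.tensor([float(x1), float(y1), float(x2), float(y2)]))
    return torch.stack(grid_cells), torch.stack(grid_coordinates)

# Dummy check: a 3x224x224 input gives 16 patches of shape 3x64x64 and 16 boxes
patches, boxes = _make_grid_fixed(torch.randn(3, 224, 224))
print(patches.shape, boxes.shape)  # torch.Size([16, 3, 64, 64]) torch.Size([16, 4])

For the alternative route, resizing the whole batch before it enters forward, e.g. img = F.interpolate(img, size=(256, 256), mode='bilinear', align_corners=False), would also make each quarter-of-the-image cell come out 64x64.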

Thank you, I solved it. As you said, I had to modify the _make_grid() function to fix this.