# RuntimeError: The size of tensor a (77) must match the size of tensor b (64) at non-singleton dimension 2

My model architecture:

``````def _make_grid_(input):
# Get the width and height of the output feature map
_, height, width = input.size()

# Determine the size of each grid cell
grid_size = height // 4
grid_cells = []
boxes = torch.zeros(4, 4, 4)
for i in range(4):
for j in range(4):
x1 = i * grid_size
y1 = j * grid_size
x2 = x1 + grid_size
y2 = y1 + grid_size
boxes[i, j, :] = torch.tensor([float(x1), float(y1), float(x2), float(y2)])
grid_cell = input[:, x1:x2, y1:y2]
grid_cells.append(grid_cell)
grid_cells = torch.stack(grid_cells)
return grid_cells, boxes

def _make_grid(input):
# Get the width and height of the output feature map
_, width, height = input.size()
# Determine the size of each grid cell
grid_size = height // 4
grid_cells = []
grid_coordinates = []
for i in range(4):
for j in range(4):
# Calculate the coordinates for the current grid cell
x1 = i * grid_size
y1 = j * grid_size
x2 = x1 + grid_size
y2 = y1 + grid_size
# Extract the region corresponding to the grid cell
grid_cell = input[:, x1:x2, y1:y2]
grid_coordinates.append(torch.tensor([float(x1),float(y1),float(x2),float(y2)]))
grid_cells.append(grid_cell)
grid_cells = torch.stack(grid_cells)
grid_coordinates = torch.stack(grid_coordinates)
return grid_cells, grid_coordinates

class Grid(torch.nn.Module):
    """Stateless module: applies ``_make_grid`` to every image in a batch."""

    def __init__(self):
        super().__init__()

    def forward(self, x):
        # One (grid_cells, grid_coordinates) tuple per image.
        return [_make_grid(img) for img in x]

class CustomCLIP(torch.nn.Module):
    """CLIP-based visual-grounding head.

    Reuses pieces of the globally defined ``model`` (a loaded CLIP model;
    it exposes ``visual.conv1/bn1/relu1/avgpool/attnpool``, so presumably
    a ResNet visual backbone — TODO confirm). For each image, 16 grid
    patches are scored against the caption embedding and the box of the
    best-matching patch is returned.
    """

    def __init__(self, num_classes: int = 10, bias=False):
        super().__init__()

        # ``model`` is a module-level global, not a constructor argument.
        self.encoder = model.visual.float()
        self.conv1 = self.encoder.conv1
        self.bn1 = self.encoder.bn1
        self.relu1 = self.encoder.relu1
        self.avgpool = self.encoder.avgpool
        # NOTE(review): attnpool is bound here but never used in forward().
        self.attnpool = self.encoder.attnpool
        self.text_encoder = self.encode_text
        # Projects 512-dim text features down to 4 (box-sized) values.
        self.fc = torch.nn.Linear(512, 4, bias=bias)
        self.relu = torch.nn.ReLU()

        self.grid = Grid()

        # Shallow image tower reusing the first CLIP conv stage.
        # NOTE(review): the Linear's in_features (20000) is tied to the
        # flattened size produced by the 100x100 interpolation in
        # forward() — changing target_width/height there breaks this.
        self.image_encoder = torch.nn.Sequential(
            self.conv1,
            self.bn1,
            self.relu1,
            torch.nn.Dropout(0.1),
            self.avgpool,
            torch.nn.Flatten(),
            torch.nn.Linear(20000, 4, bias=bias),
            self.relu,
        )

    def encode_text(self, text):
        """Embed token ids through a truncated CLIP text pathway.

        NOTE(review): the transformer blocks are skipped — only token +
        positional embeddings and the final LayerNorm are applied — so the
        output keeps the full token-sequence dimension (77 per the
        traceback), which is what collides with the 64-sized target.
        """
        x = model.token_embedding(text)
        x = x + model.positional_embedding
        x = model.ln_final(x)
        x = self.fc(x)
        x = self.relu(x)

        return x

    def forward(self, img, cap):
        """Return, per image, the grid-cell box best matching its caption.

        img: batch of images (assumed square 224x224 — TODO confirm);
        cap: tokenized captions aligned index-for-index with ``img``.
        """
        grids = self.grid(img)
        # Square aspect assumed, so target patch size is 100x100.
        aspect_ratio = 224 / 224
        target_width = 100
        target_height = int(target_width / aspect_ratio)
        similar = []
        for idx, (gd, _) in enumerate(grids):
            x_ = []
            for g in gd:
                # Resize each patch so the image tower sees a fixed size.
                x = F.interpolate(g.unsqueeze(0), size=(target_width, target_height), mode='bilinear', align_corners=False)
                x = self.image_encoder(x)
                # L2-normalize so the dot product below is a cosine score.
                x /= x.norm(dim=-1, keepdim=True)
                x_.append(x)

            y = self.text_encoder(cap[idx].unsqueeze(0))
            y /= y.norm(dim=-1, keepdim=True)

            im_ = torch.cat(x_)

            # Patch-vs-caption similarity, softmaxed over the 16 patches.
            similarity = F.softmax(im_ @ y.mT, dim=0)

            max_value, max_index = torch.max(similarity, dim=0)
            # Box coordinates of the highest-scoring patch; ``device`` is
            # a module-level global.
            boxes = grids[idx][1].to(device)
            similar.append(boxes[max_index])
        out_bbox = torch.cat(similar)

        return out_bbox
``````

Getting error:

``````/usr/local/lib/python3.10/dist-packages/torch/nn/modules/loss.py:933: UserWarning: Using a target size (torch.Size([64, 4])) that is different to the input size (torch.Size([10304, 16, 77, 4])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-18-263240bbee7e> in <cell line: 1>()
----> 1 main()

6 frames
71     if has_torch_function(tensors):
---> 73     return _VF.broadcast_tensors(tensors)  # type: ignore[attr-defined]
74
75

RuntimeError: The size of tensor a (77) must match the size of tensor b (64) at non-singleton dimension 2
``````

This is a model for a visual grounding task. How can I change the code so that the shapes of the output and the target match?

you seem to have two functions with similar names `_make_grid_` and `_make_grid`.
You can delete `_make_grid_` as it doesn’t seem to be used anywhere else.

About the error message, it looks like you need to modify `_make_grid` so that it outputs `64`-sized patches instead of the `77`-sized patches it is outputting now.
Alternatively, you can send in an `img` into CustomCLIP’s `forward` that is correctly sized.

Thank you. I solved it. As you said, had to modify the `_make_grid()` function to solve this.