My model architecture:

```
import torch
import torch.nn.functional as F

def _make_grid_(input):
    # Earlier variant; not used by Grid below (which calls _make_grid).
    # Get the height and width of the feature map (C, H, W)
    _, height, width = input.size()
    # Determine the size of each grid cell (4x4 grid)
    grid_size = height // 4
    grid_cells = []
    boxes = torch.zeros(4, 4, 4)
    for i in range(4):
        for j in range(4):
            x1 = i * grid_size
            y1 = j * grid_size
            x2 = x1 + grid_size
            y2 = y1 + grid_size
            boxes[i, j, :] = torch.tensor([float(x1), float(y1), float(x2), float(y2)])
            # Extract the region corresponding to the grid cell
            grid_cell = input[:, x1:x2, y1:y2]
            grid_cells.append(grid_cell)
    grid_cells = torch.stack(grid_cells)
    return grid_cells, boxes
def _make_grid(input):
    # Get the height and width of the feature map (C, H, W)
    _, height, width = input.size()
    # Determine the size of each grid cell (4x4 grid)
    grid_size = height // 4
    grid_cells = []
    grid_coordinates = []
    for i in range(4):
        for j in range(4):
            # Calculate the coordinates for the current grid cell
            x1 = i * grid_size
            y1 = j * grid_size
            x2 = x1 + grid_size
            y2 = y1 + grid_size
            # Extract the region corresponding to the grid cell
            grid_cell = input[:, x1:x2, y1:y2]
            grid_coordinates.append(torch.tensor([float(x1), float(y1), float(x2), float(y2)]))
            grid_cells.append(grid_cell)
    grid_cells = torch.stack(grid_cells)              # [16, C, grid_size, grid_size]
    grid_coordinates = torch.stack(grid_coordinates)  # [16, 4]
    return grid_cells, grid_coordinates
class Grid(torch.nn.Module):
    def __init__(self):
        super(Grid, self).__init__()

    def forward(self, x):
        res = []
        for img in x:
            res.append(_make_grid(img))
        return res
class CustomCLIP(torch.nn.Module):
    def __init__(self, num_classes: int = 10, bias=False):
        super().__init__()
        # `model` is the pre-trained CLIP model loaded elsewhere
        self.encoder = model.visual.float()
        self.conv1 = self.encoder.conv1
        self.bn1 = self.encoder.bn1
        self.relu1 = self.encoder.relu1
        self.avgpool = self.encoder.avgpool
        self.attnpool = self.encoder.attnpool
        self.text_encoder = self.encode_text
        self.fc = torch.nn.Linear(512, 4, bias=bias)
        self.relu = torch.nn.ReLU()
        self.grid = Grid()
        # add a bottleneck
        self.image_encoder = torch.nn.Sequential(
            self.conv1,
            self.bn1,
            self.relu1,
            torch.nn.Dropout(0.1),
            self.avgpool,
            torch.nn.Flatten(),
            torch.nn.Linear(20000, 4, bias=bias),
            self.relu,
        )

    def encode_text(self, text):
        x = model.token_embedding(text)
        x = x + model.positional_embedding
        x = model.ln_final(x)
        x = self.fc(x)    # [batch, 77, 4] -- per-token features, never pooled
        x = self.relu(x)
        return x
    def forward(self, img, cap):
        grids = self.grid(img)
        aspect_ratio = 224 / 224
        target_width = 100
        target_height = int(target_width / aspect_ratio)
        similar = []
        for idx, (gd, _) in enumerate(grids):
            x_ = []
            for g in gd:
                # Resize each grid cell to a fixed size before encoding
                x = F.interpolate(g.unsqueeze(0), size=(target_width, target_height), mode='bilinear', align_corners=False)
                with torch.no_grad():
                    x = self.image_encoder(x)
                x /= x.norm(dim=-1, keepdim=True)
                x_.append(x)
            with torch.no_grad():
                y = self.text_encoder(cap[idx].unsqueeze(0))
            y /= y.norm(dim=-1, keepdim=True)
            im_ = torch.cat(x_)                        # [16, 4]
            similarity = F.softmax(im_ @ y.mT, dim=0)  # broadcasts against [1, 4, 77]
            max_value, max_index = torch.max(similarity, dim=0)
            boxes = grids[idx][1].to(device)           # `device` is defined elsewhere
            similar.append(boxes[max_index])
        out_bbox = torch.cat(similar)
        return out_bbox
```
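
As a sanity check, here is what `_make_grid` hands to `forward` for a dummy image (a sketch, assuming CLIP's 224x224 input resolution):

```
import torch

img = torch.randn(3, 224, 224)  # assumed 224x224 input resolution
cells, coords = _make_grid(img)
print(cells.shape)   # torch.Size([16, 3, 56, 56]) -- 16 cells of 56x56
print(coords.shape)  # torch.Size([16, 4])         -- one box per cell
```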

I'm getting this error:

```
/usr/local/lib/python3.10/dist-packages/torch/nn/modules/loss.py:933: UserWarning: Using a target size (torch.Size([64, 4])) that is different to the input size (torch.Size([10304, 16, 77, 4])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
return F.smooth_l1_loss(input, target, reduction=self.reduction, beta=self.beta)
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-18-263240bbee7e> in <cell line: 1>()
----> 1 main()
6 frames
/usr/local/lib/python3.10/dist-packages/torch/functional.py in broadcast_tensors(*tensors)
71 if has_torch_function(tensors):
72 return handle_torch_function(broadcast_tensors, tensors, *tensors)
---> 73 return _VF.broadcast_tensors(tensors) # type: ignore[attr-defined]
74
75
RuntimeError: The size of tensor a (77) must match the size of tensor b (64) at non-singleton dimension 2
```
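
Tracing the shapes through `forward` with dummy stand-in tensors (a sketch; the exact batch dimension depends on the data, but the trailing 77 is CLIP's token context length) shows where the extra dimensions come from: `encode_text` never pools over the 77 token positions, so every downstream tensor inherits that axis:

```
import torch
import torch.nn.functional as F

im_ = torch.randn(16, 4)   # stand-in for the 16 grid-cell image features
y = torch.randn(1, 77, 4)  # stand-in for encode_text's per-token output
similarity = F.softmax(im_ @ y.mT, dim=0)  # [16, 4] @ [1, 4, 77] -> [1, 16, 77]
_, max_index = torch.max(similarity, dim=0)
print(max_index.shape)                     # torch.Size([16, 77])
boxes = torch.randn(16, 4)                 # stand-in grid-cell coordinates
print(boxes[max_index].shape)              # torch.Size([16, 77, 4]) -- not [4]
```

Concatenating one such tensor per image is what produces an `out_bbox` with the stray 77 (and grid) dimensions that the `[64, 4]` target cannot broadcast against.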

This is a model for a visual grounding task. How can I change the model so that the shapes of the output and the target match?
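
Not a definitive fix, but one way to make the shapes line up is to pool the text features down to a single vector per caption before projecting, the way CLIP's own `encode_text` does: take the feature at the EOT token, located as the argmax of the token ids. A minimal sketch of a drop-in replacement for the `encode_text` method above, assuming `model` is the loaded CLIP model:

```
def encode_text(self, text):
    x = model.token_embedding(text)
    x = x + model.positional_embedding
    x = model.ln_final(x)
    # Pool over the 77 token positions: CLIP takes the feature at the
    # EOT token, found as the argmax of the token ids.
    x = x[torch.arange(x.shape[0]), text.argmax(dim=-1)]  # [batch, 512]
    x = self.fc(x)    # [batch, 4]
    x = self.relu(x)
    return x
```

With `y` now `[1, 4]`, `im_ @ y.mT` is `[16, 1]`, `max_index` selects a single cell, `boxes[max_index]` is `[1, 4]`, and `torch.cat(similar)` yields a `[batch, 4]` `out_bbox` that matches the `[64, 4]` target.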