Model:
def _make_grid(input):
# Get the width and height of the output feature map
_, width, height = input.size()
# Determine the size of each grid cell
grid_size = height // 16
grid_cells = []
grid_coordinates = []
for i in range(16):
for j in range(16):
# Calculate the coordinates for the current grid cell
x1 = i * grid_size
y1 = j * grid_size
x2 = x1 + grid_size
y2 = y1 + grid_size
# Extract the region corresponding to the grid cell
grid_cell = input[:, x1:x2, y1:y2]
grid_coordinates.append(torch.tensor([float(x1),float(y1),float(x2),float(y2)]))
grid_cells.append(grid_cell)
grid_cells = torch.stack(grid_cells)
grid_coordinates = torch.stack(grid_coordinates)
return grid_cells, grid_coordinates
class Grid(torch.nn.Module):
def __init__(self):
super(Grid, self).__init__()
def forward(self, x):
res = []
for img in x:
res.append(_make_grid(img))
return res
class CustomCLIP(torch.nn.Module):
def __init__(self, model, num_classes: int = 4, bias=False):
super().__init__()
self.model = model
self.num_classes = num_classes
self.encoder = self.model.visual.float()
self.conv1 = self.encoder.conv1
self.bn1 = self.encoder.bn1
self.relu1 = self.encoder.relu1
self.avgpool = self.encoder.avgpool
self.fc = torch.nn.Linear(512, self.num_classes, bias=bias)
self.relu = torch.nn.ReLU()
self.text_encoder = self.encode_text
self.grid = Grid()
# add a bottleneck
self.image_encoder = torch.nn.Sequential(
self.conv1,
self.bn1,
self.relu1,
self.avgpool,
torch.nn.Flatten(),
torch.nn.Linear(288, self.num_classes, bias=bias),
)
def encode_text(self, text):
x = self.model.token_embedding(text)
x = self.fc(x)
x = x[:, :, :, :4]
return x
def forward(self, img, cap):
grids = self.grid(img)
aspect_ratio = 224 / 224
target_width = 224 // 16
target_height = int(target_width / aspect_ratio)
similar = []
for idx, (gd, _) in enumerate(grids):
x_ = []
for g in gd:
x = F.interpolate(g.unsqueeze(0), size=(target_width, target_height), mode='bilinear', align_corners=False)
x = self.image_encoder(x)
x /= x.norm(dim=-1, keepdim=True)
x_.append(x)
#with torch.no_grad():
y = self.text_encoder(cap[idx].unsqueeze(0))
y /= y.norm(dim=-1, keepdim=True)
im_ = torch.cat(x_)
target_len = y.size(2) - im_.size(0)
im_ = F.pad(im_, (0,0,0,target_len), value=0)
similarity_scores = torch.nn.functional.cosine_similarity(im_, y, dim=-1)
max_values, max_indices = torch.max(similarity_scores, dim=1)
boxes = grids[idx][1]
selected_box = boxes[max_indices.to(boxes.device)]
similar.append(selected_box)
out_bbox = torch.cat(similar)
out_bbox = out_bbox[:, 0, :]
return out_bbox
Training step:
def training_step(net, data_loader, optimizer, cost_function, device='cuda:0'):
samples = 0.0
cumulative_loss = 0.0
cumulative_accuracy = 0.0
batch_ious = []
# set the network to training mode
net = net.float()
net.train()
accumulation_steps = 10
clip_value = 3.0
scaler = GradScaler()
# iterate over the training set
for batch_idx, batch in enumerate(data_loader):
#load data into GPU
batch['image'] = batch['image'].to(device)
batch['captions'] = batch['captions'].to(device)
target_bbox = batch['bbox'].to(device)
# forward pass
with autocast():
out_bbox = net(batch['image'], batch['captions'])
out_bbox = out_bbox.to(device)
# loss computation
loss = cost_function(out_bbox, target_bbox)
loss = loss/accumulation_steps
loss.requires_grad=True
#backward-pass
scaler.scale(loss).backward()
if (batch_idx + 1) % accumulation_steps == 0:
# parameters update
# Gradient scaling and optimization step
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(net.parameters(), clip_value)
scaler.step(optimizer)
scaler.update()
# gradients reset
optimizer.zero_grad()
# fetch prediction and loss value
samples += batch['bbox'].shape[0]
cumulative_loss += loss.item()
iou_accuracy = torchvision.ops.box_iou(target_bbox, out_bbox).detach().cpu().numpy().sum().item()
batch_ious.append(iou_accuracy)
torch.cuda.empty_cache()
# Calculate the average IoU across all batches
if batch_ious:
# Calculate the average IoU across all batches
cumulative_accuracy = np.mean(batch_ious)
else:
cumulative_accuracy = 0.0 # Set to zero if there are no elements
return cumulative_loss/samples, cumulative_accuracy
This error cant seem to go away. What am I doing wrong?
P.S. Batch size=100 training data=4000
AssertionError Traceback (most recent call last)
Cell In[13], line 1
----> 1 main()
Cell In[12], line 56, in main(root, data_dir, batch_size, test_batch_size, num_classes, learning_rate, weight_decay, momentum, epochs)
53 # for each epoch, train the network and then compute evaluation results
54 for e in range(epochs):
---> 56 train_loss, train_accuracy = training_step(modified_model, train_loader, optimizer, cost_function)
57 torch.cuda.empty_cache()
58 val_loss, val_accuracy = test_step(modified_model, val_loader, cost_function)
Cell In[8], line 35, in training_step(net, data_loader, optimizer, cost_function, device)
28 scaler.scale(loss).backward()
30 if (batch_idx + 1) % accumulation_steps == 0:
31 # parameters update
32 # Gradient scaling and optimization step
33 #scaler.unscale_(optimizer)
34 #torch.nn.utils.clip_grad_norm_(net.parameters(), clip_value) #ommit this
---> 35 scaler.step(optimizer)
36 scaler.update()
37 # gradients reset
File ~/miniconda3/lib/python3.10/site-packages/torch/cuda/amp/grad_scaler.py:372, in GradScaler.step(self, optimizer, *args, **kwargs)
369 if optimizer_state["stage"] is OptState.READY:
370 self.unscale_(optimizer)
--> 372 assert len(optimizer_state["found_inf_per_device"]) > 0, "No inf checks were recorded for this optimizer."
374 retval = self._maybe_opt_step(optimizer, optimizer_state, *args, **kwargs)
376 optimizer_state["stage"] = OptState.STEPPED
AssertionError: No inf checks were recorded for this optimizer.