Extreme GPU memory usage

I am getting some wild memory usage using the code below.

import os

import torch
from torch.utils.data import DataLoader, Dataset, random_split
from torchvision import tv_tensors
from torchvision.io import read_image
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor
from torchvision.transforms import v2

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

class MyDataSet(Dataset):
    def __init__(self, root_dir, dataframe):
        self.root_dir = root_dir
        self.dataframe = dataframe
        # Base names of all .jpg files in the root directory
        self.images = [img[:-4] for img in os.listdir(root_dir) if img[-4:] == ".jpg"]

    def __len__(self):
        return len(self.images)

    def __getitem__(self, index):
        image_path = os.path.join(self.root_dir, f"{self.images[index]}.jpg")
        # Decode the image, convert it to a float tensor and move it to the GPU
        image = v2.functional.convert_image_dtype(read_image(image_path)).to(device)
        # All bounding boxes belonging to this image
        boxes = self.dataframe.loc[self.dataframe.page_id == self.images[index]]
        
        target = {}
        target["boxes"] = tv_tensors.BoundingBoxes(torch.tensor(boxes[["x", "y", "x1", "y1"]].values), format="XYXY", canvas_size=v2.functional.get_size(image), device=device)
        target["labels"] = torch.ones(len(boxes), dtype=torch.int64, device=device)
        
        return image, target

model = fasterrcnn_resnet50_fpn_v2(num_classes=classes)
in_features = model.roi_heads.box_predictor.cls_score.in_features
model.roi_heads.box_predictor = FastRCNNPredictor(in_features, classes)
model.to(device)
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001, weight_decay=0.0001)

def collate_fn(batch):
    return list(zip(*batch))

training_data, test_data = random_split(dataset, [0.75, 0.25])
train_loader = DataLoader(training_data, batch_size=2, collate_fn=collate_fn, shuffle=True)
test_loader = DataLoader(test_data, batch_size=2, collate_fn=collate_fn, shuffle=True)

def train(num_epochs):
    best_accuracy = 0.0

    for epoch in range(num_epochs):  # one epoch = the entire dataset one time
        running_loss = 0.0
        running_acc = 0.0

        for i, (images, targets) in enumerate(train_loader):
            optimizer.zero_grad()
            # In training mode the detection model returns a dict of losses
            outputs = model(images, targets)
            loss = sum(loss for loss in outputs.values())
            #loss = criterion(outputs["loss_classifier"], targets)

            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            if i % 1000 == 999:
                print(f"[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 1000:.3f}")
                running_loss = 0.0

train(1)

The root directory contains about 90 MB worth of images. The instant outputs = model(images, targets) is called, GPU memory usage spikes to almost 100% (16 GB available). What am I doing wrong here?

Besides the inputs and the model parameters, the intermediate activations are also saved during training, so you might want to check this post.
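
If it helps to see that effect in isolation, here is a minimal sketch (using a dummy 800x800 image, a dummy target and num_classes=2 rather than your data, and assuming a CUDA device is available) that compares the peak GPU memory of a training-mode forward pass, which keeps the intermediate activations for the backward pass, with a no_grad forward pass, which frees them immediately:

# Minimal sketch: compare peak GPU memory with and without stored activations.
# The 800x800 image size and num_classes=2 are arbitrary choices for illustration.
import torch
from torchvision.models.detection import fasterrcnn_resnet50_fpn_v2

device = torch.device("cuda")
model = fasterrcnn_resnet50_fpn_v2(num_classes=2).to(device)

images = [torch.rand(3, 800, 800, device=device)]
targets = [{
    "boxes": torch.tensor([[10.0, 10.0, 200.0, 200.0]], device=device),
    "labels": torch.ones(1, dtype=torch.int64, device=device),
}]

def peak_mb(fn):
    # Reset the peak-memory counter, run fn and report the new peak in MB
    torch.cuda.reset_peak_memory_stats()
    fn()
    return torch.cuda.max_memory_allocated() / 1024**2

model.train()
print("train forward (activations kept):", peak_mb(lambda: model(images, targets)), "MB")

model.eval()
with torch.no_grad():
    # No autograd graph is built, so intermediate activations are freed right away
    print("no_grad forward:", peak_mb(lambda: model(images)), "MB")

If the training peak is too close to the 16 GB limit, the usual levers are a smaller batch size, smaller input images, or mixed precision with torch.cuda.amp. It is also more conventional to keep the dataset on the CPU and move each batch to the GPU inside the training loop rather than in __getitem__.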