CUDA runs out of memory after some steps

I have the following code (two methods from my trainer class), but CUDA keeps running out of memory after a varying number of steps (sometimes 3, sometimes 50). Is there something I'm doing wrong?

import torch
from torch import Tensor
from torch.nn import BCELoss
from torch.optim import Optimizer
from torch.utils.data import DataLoader

def _evaluate_single_batch(self, batch: tuple[Tensor, ...]) -> Tensor:
    input_ids = batch[0]
    attention_mask = batch[1]
    # chunks per example, used to regroup the flat predictions later
    number_of_chunks = [len(x) for x in input_ids]

    # flatten every example's chunks into one combined batch for a single forward pass
    input_ids_combined = []
    for x in input_ids:
        input_ids_combined.extend(x.tolist())
    input_ids_combined_tensors = torch.stack(
        [torch.tensor(x) for x in input_ids_combined]
    ).to(self.device)

    attention_mask_combined = []
    for x in attention_mask:
        attention_mask_combined.extend(x.tolist())
    attention_mask_combined_tensors = torch.stack(
        [torch.tensor(x) for x in attention_mask_combined]
    ).to(self.device)

    # one forward pass over all chunks; move predictions to CPU and free the GPU copies
    preds_out = self.neural_network(input_ids_combined_tensors, attention_mask_combined_tensors)
    preds = preds_out.flatten().cpu()
    del input_ids_combined_tensors, attention_mask_combined_tensors, preds_out
    torch.cuda.empty_cache()

    # regroup the flat predictions per example and mean-pool each group
    preds_split = preds.split(number_of_chunks)
    pooled_preds = torch.cat([torch.mean(x).reshape(1) for x in preds_split])

    return pooled_preds

def _train_single_epoch(self, dataloader: DataLoader, optimizer: Optimizer) -> None:
    self.neural_network.train()
    cross_entropy = BCELoss()

    for step, batch in enumerate(dataloader):
        labels = batch[-1].float().cpu()
        predictions = self._evaluate_single_batch(batch)
        # scale so the accumulated gradients average over accumulation_steps batches
        loss = cross_entropy(predictions, labels) / self.accumulation_steps
        loss.backward()

        # step the optimizer every accumulation_steps batches, and at the end of the epoch
        if ((step + 1) % self.accumulation_steps == 0) or (step + 1 == len(dataloader)):
            optimizer.step()
            optimizer.zero_grad()
            torch.cuda.empty_cache()

        del batch, labels, predictions, loss

You could log the shape of every input batch and check whether the OOM is raised for a specifically large input. In this code each example contributes a variable number of chunks, so the effective batch size of the forward pass changes from step to step; an occasional batch with unusually many chunks would explain why the crash sometimes comes after 3 steps and sometimes after 50.
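
A minimal sketch of how to check this, assuming self.device is a CUDA device (the helper name and the MiB formatting are my own; the torch.cuda memory counters are standard PyTorch):

import torch

def log_cuda_memory(step: int, num_chunks: int, device: torch.device) -> None:
    # Hypothetical helper: report this step's chunk count alongside the current
    # and peak CUDA memory, then reset the peak counter so the next step's
    # peak is measured independently.
    allocated = torch.cuda.memory_allocated(device) / 2**20
    peak = torch.cuda.max_memory_allocated(device) / 2**20
    print(f"step {step}: chunks={num_chunks}, allocated={allocated:.0f} MiB, peak={peak:.0f} MiB")
    torch.cuda.reset_peak_memory_stats(device)

Called at the top of the loop in _train_single_epoch with sum(len(x) for x in batch[0]), it should show whether the memory peaks line up with the batches that carry the most chunks; if they do, the OOM comes from a specifically large input rather than a gradual leak.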