CUDA out-of-memory error on SQuAD

Hello all,
I am fine-tuning a T5 conditional-generation model on the SQuAD dataset on Kaggle with a GPU,
but I am getting a CUDA out-of-memory error with the following code:

class SquadQAModel(pl.LightningModule):
    """LightningModule wrapping a seq2seq model (e.g. T5) for SQuAD QA.

    Fix for the reported CUDA OOM: the original ``training_step`` returned the
    full ``logits`` tensor of every batch. Lightning keeps everything returned
    from ``*_step`` alive until the ``*_epoch_end`` hook runs, so the logits of
    the entire epoch accumulated on the GPU. ``training_epoch_end`` never used
    them, so only the loss is returned now.
    """

    def __init__(self):
        super().__init__()
        # ``Model`` is a module-level pre-loaded model — presumably a
        # HuggingFace *ForConditionalGeneration instance; confirm at call site.
        self.model = Model

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model.

        Returns:
            (loss, logits) — ``loss`` is None when ``labels`` is None.
        """
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels)
        return output.loss, output.logits

    def _shared_step(self, batch, stage):
        """Common forward + loss logging for train/val/test steps.

        ``stage`` is the metric prefix ("train", "val" or "test").
        """
        # Call self(...) rather than self.forward(...) so PyTorch/Lightning
        # hooks (precision, sharding, etc.) are applied.
        loss, _ = self(
            batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['labels'])
        self.log(f"{stage}_loss", loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        loss = self._shared_step(batch, "train")
        # Do NOT return logits/labels here: anything returned is retained for
        # training_epoch_end and would exhaust GPU memory over an epoch.
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        return self._shared_step(batch, "val")

    def test_step(self, batch, batch_idx):
        return self._shared_step(batch, "test")

    def configure_optimizers(self):
        # Plain AdamW, fixed learning rate, no scheduler.
        return AdamW(self.model.parameters(), lr=0.0001)

    def training_epoch_end(self, outputs):
        # NOTE(review): trainer.progress_bar_dict is deprecated/removed in
        # newer Lightning versions; kept as in the original code.
        print("Epoch {} =======>>>> {}".format(self.trainer.current_epoch, self.trainer.progress_bar_dict))

    def validation_epoch_end(self, outputs):
        print('EPOCH {} ======={}>>>>'.format(self.trainer.current_epoch, self.trainer.callback_metrics))

Can anyone please look into it? Why does the 16 GB GPU keep filling up after everything is loaded onto the CUDA device?

The memory footprint of your training seems to be too high for 16 GB, so you could reduce the batch size, use a smaller model, or use e.g. `torch.utils.checkpoint` to trade compute for memory. Also note that returning the full `predictions` logits from `training_step` causes Lightning to keep them alive until `training_epoch_end`, so they accumulate on the GPU over the whole epoch — return only the loss (or `.detach()` anything else you return).