CUDA out-of-memory error on SQuAD

Hello all,
I am fine-tuning a T5 conditional-generation model on the SQuAD dataset on Kaggle with a GPU,
but I am getting a CUDA out-of-memory error with the following code:

class SquadQAModel(pl.LightningModule):
    """LightningModule wrapping a seq2seq model (e.g. T5) for SQuAD QA.

    Fix for the reported CUDA OOM: the original ``training_step`` returned the
    full ``logits`` tensor of every batch. Lightning keeps everything returned
    from ``*_step`` alive until the ``*_epoch_end`` hook runs, so the logits of
    the entire epoch accumulated on the GPU. ``training_epoch_end`` never used
    them, so only the loss is returned now.
    """

    def __init__(self):
        super().__init__()
        # ``Model`` is a module-level pre-loaded model — presumably a
        # HuggingFace *ForConditionalGeneration instance; confirm at call site.
        self.model = Model

    def forward(self, input_ids, attention_mask, labels=None):
        """Run the wrapped model.

        Returns:
            (loss, logits) — ``loss`` is None when ``labels`` is None.
        """
        output = self.model(
            input_ids,
            attention_mask=attention_mask,
            labels=labels)
        return output.loss, output.logits

    def _shared_step(self, batch, stage):
        """Common forward + loss logging for train/val/test steps.

        ``stage`` is the metric prefix ("train", "val" or "test").
        """
        # Call self(...) rather than self.forward(...) so PyTorch/Lightning
        # hooks (precision, sharding, etc.) are applied.
        loss, _ = self(
            batch['input_ids'],
            attention_mask=batch['attention_mask'],
            labels=batch['labels'])
        self.log(f"{stage}_loss", loss, prog_bar=True, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        loss = self._shared_step(batch, "train")
        # Do NOT return logits/labels here: anything returned is retained for
        # training_epoch_end and would exhaust GPU memory over an epoch.
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        return self._shared_step(batch, "val")

    def test_step(self, batch, batch_idx):
        return self._shared_step(batch, "test")

    def configure_optimizers(self):
        # Plain AdamW, fixed learning rate, no scheduler.
        return AdamW(self.model.parameters(), lr=0.0001)

    def training_epoch_end(self, outputs):
        # NOTE(review): trainer.progress_bar_dict is deprecated/removed in
        # newer Lightning versions; kept as in the original code.
        print("Epoch {} =======>>>> {}".format(self.trainer.current_epoch, self.trainer.progress_bar_dict))

    def validation_epoch_end(self, outputs):
        print('EPOCH {} ======={}>>>>'.format(self.trainer.current_epoch, self.trainer.callback_metrics))

Can anyone please look into it? Why does the 16 GB GPU keep filling up after everything is loaded onto the CUDA device?

The memory footprint of your training seems to be too high for 16 GB, so you could reduce the batch size, use a smaller model, or use e.g. `torch.utils.checkpoint` to trade compute for memory. Also note that returning the full `predictions` logits from `training_step` causes Lightning to keep them alive until `training_epoch_end`, so they accumulate on the GPU over the whole epoch — return only the loss (or `.detach()` anything else you return).