Hello all,
I am fine tuning T5 Conditional Generation model with squad dataset on kaggle with gpu.
But i am getting this cuda out of memory error for the following code
class SquadQAModel(pl.LightningModule):
def __init__(self):
super().__init__()
self.model = Model
def forward(self, input_ids, attention_mask, labels=None):
output = self.model(
input_ids,
attention_mask=attention_mask,
labels=labels)
return output.loss, output.logits
def training_step(self, batch, batch_idx):
input_ids = batch['input_ids']
attention_mask=batch['attention_mask']
labels = batch['labels']
loss, outputs = self.forward(input_ids, attention_mask, labels)
self.log("train_loss", loss, prog_bar=True, logger=True)
return {"loss": loss, "predictions":outputs, "labels": labels}
def validation_step(self, batch, batch_idx):
input_ids = batch['input_ids']
attention_mask=batch['attention_mask']
labels = batch['labels']
loss, outputs = self.forward(input_ids, attention_mask, labels)
self.log("val_loss", loss, prog_bar=True, logger=True)
return loss
def test_step(self, batch, batch_idx):
input_ids = batch['input_ids']
attention_mask=batch['attention_mask']
labels = batch['labels']
loss, outputs = self.forward(input_ids, attention_mask, labels)
self.log("test_loss", loss, prog_bar=True, logger=True)
return loss
def configure_optimizers(self):
optimizer = AdamW(self.model.parameters(), lr=0.0001)
return optimizer
def training_epoch_end(self, outputs):
print("Epoch {} =======>>>> {}".format(self.trainer.current_epoch, self.trainer.progress_bar_dict ))
def validation_epoch_end(self, outputs):
print('EPOCH {} ======={}>>>>'.format(self.trainer.current_epoch, self.trainer.callback_metrics))
can anyone please see into it ? Why the 16 GB gpu keeps feeling after loading every thing on cuda device