Hi, yes, sure. Thank you for your response.
Here is my source code, in case it helps show my error better.
This is the data module:
import pytorch_lightning as pl
from torch.utils.data import DataLoader

class DataModule(pl.LightningDataModule):
    def __init__(self, train_dataset, val_dataset, batch_size=1):
        super(DataModule, self).__init__()
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.batch_size = batch_size

    def train_dataloader(self):
        # collate_fn is defined elsewhere in my script
        return DataLoader(self.train_dataset, batch_size=self.batch_size, collate_fn=collate_fn,
                          shuffle=True, num_workers=0, pin_memory=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, collate_fn=collate_fn,
                          shuffle=False, num_workers=0, pin_memory=True)
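(collate_fn is a small helper defined elsewhere in my script; it builds the batch dict with the 'boxes', 'img', 'question', 'tokenized_words' and 'answer' fields that the model uses. A rough, simplified sketch of that kind of collate function, not my exact code, would be:)

import torch

def collate_fn(samples):
    # Simplified sketch only: stack each field of the samples into one batch dict.
    # My real collate_fn may differ (for example in how it handles padding).
    return {
        key: torch.stack([sample[key] for sample in samples])
        for key in ['boxes', 'img', 'question', 'tokenized_words', 'answer']
    }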
and this is part of the model:
import torch
import torch.nn as nn

class LaTrForVQA(pl.LightningModule):
    def __init__(self, config, learning_rate=1e-4, max_steps=100000 // 2):
        super(LaTrForVQA, self).__init__()
        self.config = config
        self.save_hyperparameters()
        # LaTr_for_finetuning is defined elsewhere in my script
        self.latr = LaTr_for_finetuning(config)
        self.training_losses = []
        self.validation_losses = []
        self.max_steps = max_steps

    def configure_optimizers(self):
        return torch.optim.AdamW(self.parameters(), lr=self.hparams['learning_rate'])

    def forward(self, batch_dict):
        boxes = batch_dict['boxes']
        img = batch_dict['img']
        question = batch_dict['question']
        words = batch_dict['tokenized_words']
        answer_vector = self.latr(lang_vect=words,
                                  spatial_vect=boxes,
                                  img_vect=img,
                                  quest_vect=question)
        return answer_vector

    def calculate_metrics(self, prediction, labels):
        ## Calculate the accuracy between the predictions and the ground-truth labels
        ## for a batch, taking the padded sequence into account
        ## (calculate_acc_score is defined elsewhere in my script)
        batch_size = len(prediction)
        ac_score = 0
        for (pred, gt) in zip(prediction, labels):
            ac_score += calculate_acc_score(pred.detach().cpu(), gt.detach().cpu())
        ac_score = ac_score / batch_size
        return ac_score

    def training_step(self, batch, batch_idx):
        answer_vector = self.forward(batch)

        ## https://discuss.huggingface.co/t/bertformaskedlm-s-loss-and-scores-how-the-loss-is-computed/607/2
        loss = nn.CrossEntropyLoss()(answer_vector.reshape(-1, self.config['classes']),
                                     batch['answer'].reshape(-1))
        _, preds = torch.max(answer_vector, dim=-1)

        ## Calculating the accuracy score
        train_acc = self.calculate_metrics(preds, batch['answer'])
        train_acc = torch.tensor(train_acc)

        ## Logging
        self.log('train_ce_loss', loss, prog_bar=True)
        self.log('train_acc', train_acc, prog_bar=True)
        self.training_losses.append(loss.item())

        return loss

    def validation_step(self, batch, batch_idx):
        logits = self.forward(batch)
        loss = nn.CrossEntropyLoss()(logits.reshape(-1, self.config['classes']),
                                     batch['answer'].reshape(-1))
        _, preds = torch.max(logits, dim=-1)

        ## Validation accuracy
        val_acc = self.calculate_metrics(preds.cpu(), batch['answer'].cpu())
        val_acc = torch.tensor(val_acc)

        ## Logging
        self.log('val_ce_loss', loss, prog_bar=True)
        self.log('val_acc', val_acc, prog_bar=True)

        return {'val_loss': loss, 'val_acc': val_acc}

    def optimizer_step(self, epoch_nb, batch_nb, optimizer, optimizer_i, opt_closure=None,
                       on_tpu=False, using_native_amp=False, using_lbfgs=False):
        ## Linear warm-up for the first 1000 steps, then polynomial decay
        if self.trainer.global_step < 1000:
            lr_scale = min(1., float(self.trainer.global_step + 1) / 1000.)
            for pg in optimizer.param_groups:
                pg['lr'] = lr_scale * self.hparams.learning_rate
        else:
            for pg in optimizer.param_groups:
                pg['lr'] = polynomial(self.hparams.learning_rate, self.trainer.global_step,
                                      max_iter=self.max_steps)

        optimizer.step(opt_closure)
        optimizer.zero_grad()

    def validation_epoch_end(self, outputs):
        val_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        val_acc = torch.stack([x['val_acc'] for x in outputs]).mean()

        self.log('val_loss_epoch_end', val_loss, on_epoch=True, sync_dist=True)
        self.log('val_acc_epoch_end', val_acc, on_epoch=True, sync_dist=True)
        self.val_prediction = []

model = LaTrForVQA(config)
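(polynomial is my learning-rate decay helper and is also defined elsewhere. Just so the snippet is complete, a minimal sketch of a polynomial decay with that signature, assuming power 1.0 and decay towards zero, which may not match my exact implementation, would be:)

def polynomial(base_lr, iter, max_iter=100000, power=1.0):
    # Sketch only: decay base_lr polynomially towards 0 over max_iter steps
    return base_lr * (max(0.0, 1.0 - float(iter) / max_iter) ** power)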
and this is the trainer:
trainer = pl.Trainer(
    max_steps=max_steps,
    default_root_dir="runs",
    gpus=2,
    deterministic=True,
)
Now I fit the model:
datamodule = DataModule(train_ds, val_ds)
trainer.fit(model, datamodule)
If you have any other questions, be sure to ask…
I changed my code according to some suggestions, so that I no longer wrap the model for parallelism myself and only rely on pl.Trainer(gpus=2). But with this code I still can't use both GPUs in parallel: only one GPU is actually used, and I don't know what other changes I should apply to this code.
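The only other thing I am wondering about, but have not been able to confirm, is whether I also need to pass the distributed strategy to the Trainer explicitly, something like the sketch below (assuming a Lightning version that accepts strategy="ddp"; I think older versions used accelerator="ddp" or distributed_backend="ddp" instead):

trainer = pl.Trainer(
    max_steps=max_steps,
    default_root_dir="runs",
    gpus=2,
    strategy="ddp",   # not sure whether this is needed, or which value is correct here
    deterministic=True,
)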