I suspect there is an issue where my training run is somehow picking up my validation set. Here is my sample code, which I modified from https://github.com/HLTCHKUST/Xpersona/blob/master/multilingual/train_decoder_only.py:
def update(engine, batch):
    model.train()
    # Move every model input to the target device
    batch = tuple(batch[input_name].to(args.device) for input_name in MODEL_INPUTS)
    input_ids, lm_labels, token_type_ids = batch
    lm_loss, prediction_scores, *_ = model(input_ids=input_ids, token_type_ids=token_type_ids, lm_labels=lm_labels)
    loss = lm_loss / args.gradient_accumulation_steps
    if args.fp16:
        with amp.scale_loss(loss, optimizer) as scaled_loss:
            scaled_loss.backward()
        print("BACKWARD CALLED fp16")
        torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
    else:
        print("BACKWARD CALLED")
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
    if engine.state.iteration % args.gradient_accumulation_steps == 0:
        print("Step called")
        optimizer.step()
        optimizer.zero_grad()
    return loss.item()
trainer = Engine(update)
# Evaluation function and evaluator (evaluator output is the input of the metrics)
def inference(engine, batch):
    model.eval()
    with torch.no_grad():
        batch = tuple(batch[input_name].to(args.device) for input_name in MODEL_INPUTS)
        input_ids, lm_labels, token_type_ids = batch
        logger.info(tokenizer.decode(input_ids[0, :].tolist()))
        # If we don't send labels to the model, it doesn't return losses
        lm_logits, *_ = model(input_ids=input_ids, token_type_ids=token_type_ids)
        lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
        lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
        return (lm_logits_flat_shifted,), (lm_labels_flat_shifted,)
evaluator = Engine(inference)
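For context, the train_loader, val_loader, train_sampler, and valid_sampler used below are built earlier in the script, roughly along these lines (a simplified sketch: the dataset variables and batch-size arguments are my assumptions, not copied from the script):

from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

# Simplified sketch of the loader construction earlier in the script.
# train_dataset, valid_dataset, args.train_batch_size and
# args.valid_batch_size are stand-ins for the real variables.
train_sampler = DistributedSampler(train_dataset) if args.distributed else None
valid_sampler = DistributedSampler(valid_dataset) if args.distributed else None
train_loader = DataLoader(train_dataset, sampler=train_sampler,
                          batch_size=args.train_batch_size,
                          shuffle=(train_sampler is None))
val_loader = DataLoader(valid_dataset, sampler=valid_sampler,
                        batch_size=args.valid_batch_size, shuffle=False)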
The way I use them is as follows:
# Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
if args.n_epochs < 1:
    trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
if args.eval_before_start:
    trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))
# Make sure distributed data samplers split the dataset nicely between the distributed processes
if args.distributed:
    trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
    evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))
# Linearly decrease the learning rate from lr to zero
scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)
# Prepare metrics - note how we compute distributed metrics
RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0]))}
metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args)})
metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
for name, metric in metrics.items():
    metric.attach(evaluator, name)
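As an aside on the metrics above: average_ppl is just the exponential of average_nll, so NLL values map directly to perplexities. A quick check of the arithmetic:

import math

math.exp(2.62)  # about 13.7, the perplexity corresponding to an average NLL of 2.62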
# On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
if args.local_rank in [-1, 0]:
    pbar = ProgressBar(persist=True)
    pbar.attach(trainer, metric_names=["loss"])
    evaluator.add_event_handler(Events.COMPLETED,
                                lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))
    log_dir = make_logdir(args.model_checkpoint)
    log_dir += "_decoder_only"
    if args.no_lang_id:
        log_dir += "_noid"
    tb_logger = TensorboardLogger(log_dir)
    tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                     event_name=Events.ITERATION_COMPLETED)
    tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
    tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()),
                                                          global_step_transform=global_step_from_engine(trainer)),
                     event_name=Events.EPOCH_COMPLETED)
    checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3)
    trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {
        'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation
    torch.save(args, log_dir + '/model_training_args.bin')
    getattr(model, 'module', model).config.to_json_file(
        os.path.join(log_dir, CONFIG_NAME))  # the config for encoder and decoder should be the same
    tokenizer.save_pretrained(log_dir)
# Run the training
trainer.run(train_loader, max_epochs=args.n_epochs)
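To check which loader the trainer actually receives, I can log the loader sizes right before the run call (a minimal check; note that len() of a DataLoader is the number of batches per epoch, not the number of samples):

# Sanity check just before trainer.run: a DataLoader's len() is
# batches per epoch, i.e. ceil(dataset_size / batch_size) when drop_last=False.
logger.info("Train loader length (batches): %d", len(train_loader))
logger.info("Valid loader length (batches): %d", len(val_loader))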
Now, my dataset lengths are:
INFO:./train_decoder_only.py:Train dataset length: 112449
INFO:./train_decoder_only.py:Valid dataset length: 28113
However, in the training loop I see:
INFO:ignite.engine.engine.Engine:Engine run starting with max_epochs=1.
[1/28113] 0%| [00:00<?]
Epoch: [1/28113] 0%| [00:00<00:01]
Epoch: [1/28113] 0%| , loss=2.61 [00:00<00:09]
Epoch: [1/28113] 0%| , loss=2.61 [00:00<1:08:23]
Epoch: [1/28113] 0%| , loss=2.62 [00:00<1:08:35]
Epoch: [2/28113] 0%| , loss=2.62 [00:00<1:08:37]
Epoch: [2/28113] 0%| , loss=2.62 [00:00<1:08:37]
Epoch: [2/28113] 0%| , loss=2.63 [00:00<1:08:37]
Epoch: [3/28113] 0%| , loss=2.63 [00:00<1:05:06]
Epoch: [3/28113] 0%| , loss=2.63 [00:00<1:05:06]
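What worries me is that the per-epoch total shown by the progress bar, 28113, equals my validation set length rather than my training set length of 112449. On the other hand, the bar counts batches rather than samples, so the same total could in principle come from the training loader alone; for example, with a batch size of 4 (an assumption, my real batch size comes from args):

import math

math.ceil(112449 / 4)  # = 28113, the same per-epoch total the progress bar shows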
So I cannot tell whether all of my training set is actually being picked up and used for training, or whether the trainer is somehow iterating over my validation set instead. Is there a way to fix, or at least verify, this?