PyTorch-Ignite: training not happening on the training set

I suspect there is an issue and the training is somehow picking up my validation set. Here is my sample code.

I have modified the code from here: https://github.com/HLTCHKUST/Xpersona/blob/master/multilingual/train_decoder_only.py

    def update(engine, batch):
        model.train()
        batch = tuple(batch[input_name].to(args.device) for input_name in MODEL_INPUTS)
        input_ids, lm_labels, token_type_ids = batch
        lm_loss, prediction_scores, *_ = model(input_ids = input_ids, token_type_ids= token_type_ids, lm_labels = lm_labels)
        #batch = tuple(input_tensor.to(args.device) for input_tensor in batch)
        loss = (lm_loss) / args.gradient_accumulation_steps
        if args.fp16:
            with amp.scale_loss(loss, optimizer) as scaled_loss:
                scaled_loss.backward()
                print("BACKWARD CALLED fp16")
            torch.nn.utils.clip_grad_norm_(amp.master_params(optimizer), args.max_norm)
        else:
            print("BACKWARD CALLED")
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), args.max_norm)
        if engine.state.iteration % args.gradient_accumulation_steps == 0:
            print("Step called")
            optimizer.step()
            optimizer.zero_grad()
        return loss.item()
    trainer = Engine(update)
    # Evaluation function and evaluator (evaluator output is the input of the metrics)
    def inference(engine, batch):
        model.eval()
        with torch.no_grad():
            batch = tuple(batch[input_name].to(args.device) for input_name in MODEL_INPUTS)
            input_ids, lm_labels, token_type_ids = batch
            logger.info(tokenizer.decode(input_ids[0, :].tolist()))
            # if we don't send labels to the model, it doesn't return losses
            lm_logits, *_ = model(input_ids = input_ids, token_type_ids= token_type_ids)
            lm_logits_flat_shifted = lm_logits[..., :-1, :].contiguous().view(-1, lm_logits.size(-1))
            lm_labels_flat_shifted = lm_labels[..., 1:].contiguous().view(-1)
            return (lm_logits_flat_shifted, ), (lm_labels_flat_shifted, )
    evaluator = Engine(inference)

The way I use them is as follows:

    # Attach evaluation to trainer: we evaluate when we start the training and at the end of each epoch
    trainer.add_event_handler(Events.EPOCH_COMPLETED, lambda _: evaluator.run(val_loader))
    if args.n_epochs < 1:
        trainer.add_event_handler(Events.COMPLETED, lambda _: evaluator.run(val_loader))
    if args.eval_before_start:
        trainer.add_event_handler(Events.STARTED, lambda _: evaluator.run(val_loader))

    # Make sure distributed data samplers split the dataset nicely between the distributed processes
    if args.distributed:
        trainer.add_event_handler(Events.EPOCH_STARTED, lambda engine: train_sampler.set_epoch(engine.state.epoch))
        evaluator.add_event_handler(Events.EPOCH_STARTED, lambda engine: valid_sampler.set_epoch(engine.state.epoch))

    # Linearly decrease the learning rate from lr to zero
    scheduler = PiecewiseLinear(optimizer, "lr", [(0, args.lr), (args.n_epochs * len(train_loader), 0.0)])
    trainer.add_event_handler(Events.ITERATION_STARTED, scheduler)

    # Prepare metrics - note how we compute distributed metrics
    RunningAverage(output_transform=lambda x: x).attach(trainer, "loss")
    metrics = {"nll": Loss(torch.nn.CrossEntropyLoss(ignore_index=-1), output_transform=lambda x: (x[0][0], x[1][0]))}
    metrics.update({"average_nll": MetricsLambda(average_distributed_scalar, metrics["nll"], args)})
    metrics["average_ppl"] = MetricsLambda(math.exp, metrics["average_nll"])
    for name, metric in metrics.items():
        metric.attach(evaluator, name)

    # On the main process: add progress bar, tensorboard, checkpoints and save model, configuration and tokenizer before we start to train
    if args.local_rank in [-1, 0]:
        pbar = ProgressBar(persist=True)
        pbar.attach(trainer, metric_names=["loss"])
        evaluator.add_event_handler(Events.COMPLETED,
                                    lambda _: pbar.log_message("Validation: %s" % pformat(evaluator.state.metrics)))

        log_dir = make_logdir(args.model_checkpoint)
        log_dir += "_decoder_only"
        if args.no_lang_id:
            log_dir += "_noid"
        tb_logger = TensorboardLogger(log_dir)

        tb_logger.attach(trainer, log_handler=OutputHandler(tag="training", metric_names=["loss"]),
                         event_name=Events.ITERATION_COMPLETED)
        tb_logger.attach(trainer, log_handler=OptimizerParamsHandler(optimizer), event_name=Events.ITERATION_STARTED)
        # log validation metrics once per evaluation run, using the trainer's step for the x-axis
        tb_logger.attach(evaluator, log_handler=OutputHandler(tag="validation", metric_names=list(metrics.keys()),
                                                              global_step_transform=global_step_from_engine(trainer)),
                         event_name=Events.EPOCH_COMPLETED)

        checkpoint_handler = ModelCheckpoint(log_dir, 'checkpoint', save_interval=1, n_saved=3)
        trainer.add_event_handler(Events.EPOCH_COMPLETED, checkpoint_handler, {
            'mymodel': getattr(model, 'module', model)})  # "getattr" takes care of distributed encapsulation

        torch.save(args, log_dir + '/model_training_args.bin')
        getattr(model, 'module', model).config.to_json_file(
            os.path.join(log_dir, CONFIG_NAME))  # the config for encoder and decoder should be the same
        tokenizer.save_pretrained(log_dir)

    # Run the training
    trainer.run(train_loader, max_epochs=args.n_epochs)

Now my dataset lengths are:

    INFO:./train_decoder_only.py:Train dataset length: 112449
    INFO:./train_decoder_only.py:Valid dataset length: 28113
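
These two numbers are just the dataset lengths, logged roughly like this (a simplified sketch, not the exact code):

    logger.info("Train dataset length: {}".format(len(train_dataset)))
    logger.info("Valid dataset length: {}".format(len(valid_dataset)))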

However, in the training loop:

    INFO:ignite.engine.engine.Engine:Engine run starting with max_epochs=1.
    [1/28113] 0%| [00:00<?]
    Epoch: [1/28113] 0%| [00:00<00:01]
    Epoch: [1/28113] 0%| , loss=2.61 [00:00<00:09]
    Epoch: [1/28113] 0%| , loss=2.61 [00:00<1:08:23]
    Epoch: [1/28113] 0%| , loss=2.62 [00:00<1:08:35]
    Epoch: [2/28113] 0%| , loss=2.62 [00:00<1:08:37]
    Epoch: [2/28113] 0%| , loss=2.62 [00:00<1:08:37]
    Epoch: [2/28113] 0%| , loss=2.63 [00:00<1:08:37]
    Epoch: [3/28113] 0%| , loss=2.63 [00:00<1:05:06]
    Epoch: [3/28113] 0%| , loss=2.63 [00:00<1:05:06]

I do not understand why not all of my training set is being picked up and used for training. Is there a way to fix this?

@VikasRajashekar how do you set up train_loader?

Hi, here is my train loader:

    from torch.utils.data import DataLoader, TensorDataset, Dataset

    train_dataset = DatasetTrain(train)
    valid_dataset = DatasetTrain(valid)
    print(train_dataset.max_len, valid_dataset.max_len)
    train_sampler = torch.utils.data.distributed.DistributedSampler(train_dataset) if args.distributed else None
    valid_sampler = torch.utils.data.distributed.DistributedSampler(valid_dataset) if args.distributed else None
    train_loader = DataLoader(train_dataset, sampler=train_sampler, batch_size=args.train_batch_size,
                              shuffle=(not args.distributed), collate_fn=collate_fn)
    valid_loader = DataLoader(valid_dataset, sampler=valid_sampler, batch_size=args.valid_batch_size, shuffle=False,
                              collate_fn=collate_fn)

The variables train and valid are just lists with the lengths I mentioned earlier in the question.

Thanks. What does len(train_loader) give once everything is fully configured?

Hello, here is my Dataset class:

    class DatasetTrain(Dataset):
        """Custom data.Dataset compatible with DataLoader."""

        def __init__(self, data):
            self.data = data
            self.dataset_len = len(self.data)
            self.max_len = max(len(x["input_ids"]) for x in self.data)

        def __getitem__(self, index):
            """Returns one data pair (source and target)."""
            item = self.data[index]
            return item

        def __len__(self):
            return self.dataset_len

It is the length of the list, I suppose.
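
To spell out the assumption I was making at this point (a throwaway check, not part of the training script; the numbers are from my logs above):

    print(len(train_dataset))   # 112449 -- the length of the underlying list
    print(len(valid_dataset))   # 28113
    # at this point I assumed len(train_loader) would also give 112449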

Well, the issue you are currently seeing is that the trainer logs the length of valid_loader instead of train_loader, right? I think there may be a problem with which data is assigned to which loader. So, my question is: if you add

    # Run the training
    print("Debug: train loader length: ", len(train_loader))
    trainer.run(train_loader, max_epochs=args.n_epochs)

and execute the full code, which value do you see? The trainer should run properly on whatever is provided to it. I think the valid loader and the train loader are getting mixed up somewhere earlier…

Hey, thanks for the input.

I found the mistake in my understanding :confused:

    INFO:./train_decoder_only.py:Train dataset length: 112449

Here we are printing the length of the dataset:

    len(train_dataset)

Whereas

    Epoch: [1/28113] 0%| [00:00<00:01]

is the length of the data loader with batch size 4, so 112449 / 4 ≈ 28113 (rounded up), which is, unfortunately, also the length of the validation dataset :laughing:, leading to all the confusion.
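
For anyone else who runs into this, here is a minimal self-contained sketch of the relationship (dummy tensors, batch size 4 as in my run; the names are only illustrative):

    import math

    import torch
    from torch.utils.data import DataLoader, TensorDataset

    # 112449 dummy samples, mirroring my train dataset length
    dataset = TensorDataset(torch.zeros(112449, 1))
    loader = DataLoader(dataset, batch_size=4)

    print(len(dataset))                 # 112449 -> number of samples
    print(len(loader))                  # 28113  -> number of batches, i.e. iterations per epoch
    print(math.ceil(len(dataset) / 4))  # 28113  -> len(loader) == ceil(len(dataset) / batch_size)

So the progress bar counts batches per epoch, not samples.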


Yeah, this can happen :slight_smile:
So, great that you could find the origin of the confusion :+1: