Much better convergence and testing results when validation block is removed

Hi everyone,

I am facing a weird problem. There is something wrong with my validation block that I am unable to figure out: training converges much more slowly, and to a much higher metric, than if I remove the validation block entirely. Does anyone have any idea why?

import numpy as np
import torch
import torch.backends.cudnn as cudnn
from torch.utils.tensorboard import SummaryWriter  # assuming the torch.utils.tensorboard writer
# project-specific imports (parse_args, cfg, Trainer, Validation) omitted

def main():

    args = parse_args()
    cfg.set_args(args.gpu_ids, args.stage, args.continue_train)
    cudnn.benchmark = True
    trainer = Trainer()
    trainer._make_batch_generator()
    trainer._make_model()
    validation = Validation()
    validation._make_batch_generator()

    writer = SummaryWriter()

    # train
    for epoch in range(trainer.start_epoch, cfg.end_epoch):

        trainer.set_lr(epoch)
        trainer.tot_timer.tic()
        trainer.read_timer.tic()

        for itr, (inputs, targets, meta_info) in enumerate(trainer.batch_generator):
            trainer.read_timer.toc()
            trainer.gpu_timer.tic()

            # forward
            trainer.optimizer.zero_grad()
            loss = trainer.model(inputs, targets, meta_info, 'train')
            loss = {k: loss[k].mean() for k in loss}

            # backward
            sum(loss[k] for k in loss).backward()
            trainer.optimizer.step()
            trainer.gpu_timer.toc()
            screen = [
                'Epoch %d/%d itr %d/%d:' % (epoch, cfg.end_epoch, itr, trainer.itr_per_epoch),
                'lr: %g' % (trainer.get_lr()),
                'speed: %.2f(%.2fs r%.2f)s/itr' % (
                    trainer.tot_timer.average_time, trainer.gpu_timer.average_time, trainer.read_timer.average_time),
                '%.2fh/epoch' % (trainer.tot_timer.average_time / 3600. * trainer.itr_per_epoch),
                ]
            screen += ['%s: %.4f' % ('training_loss_' + k, v.detach()) for k, v in loss.items()]
            trainer.logger.info(' '.join(screen))
            if itr % 10 == 0:
                for k, v in loss.items():
                    writer.add_scalar('training_loss_' + k, v.detach(), trainer.itr_per_epoch * epoch + itr)

            # run validation every 100 training iterations
            if itr % 100 == 0:
                eval_result = {}

                cur_sample_idx = 0
                trainer.model.eval()

                with torch.no_grad():
                    for itr_val, (inputs, targets, meta_info) in enumerate(validation.batch_generator):
                        valid_out = trainer.model(inputs, targets, meta_info, 'test')

                        # save output
                        valid_out = {k: v.cpu().numpy() for k, v in valid_out.items()}
                        # all outputs share the same batch size, so any key works here
                        for k, v in valid_out.items(): batch_size = valid_out[k].shape[0]
                        valid_out = [{k: v[bid] for k, v in valid_out.items()} for bid in range(batch_size)]

                        # evaluate
                        cur_eval_result = validation._evaluate(valid_out, cur_sample_idx)
                        for k, v in cur_eval_result.items():
                            if k in eval_result: eval_result[k] += v
                            else: eval_result[k] = v
                        cur_sample_idx += len(valid_out)

                    validation._print_eval_result(eval_result)
                    screen = []
                    screen += ['%s: %.4f' % ('validation_' + k, np.mean(v)) for k, v in eval_result.items()]
                    validation.logger.info(' '.join(screen))
                    for k, v in eval_result.items():
                        writer.add_scalar('validation_' + k, np.mean(v), trainer.itr_per_epoch * epoch + itr)

                # switch back to training mode after validation
                trainer.model.train()

            trainer.tot_timer.toc()
            trainer.tot_timer.tic()
            trainer.read_timer.tic()

        if epoch % 20 == 0 or epoch == (cfg.end_epoch - 1):
            trainer.save_model({
                'epoch': epoch,
                'network': trainer.model.state_dict(),
                'optimizer': trainer.optimizer.state_dict(),
            }, epoch)

    writer.flush()
    writer.close()

I would expect a slowdown in the script if the validation loop is used, since it has to be executed.
The latter issue (converging to a much higher metric) might point towards a data leak. Did you make sure that the training and validation datasets are separate?
I’m also unsure which package you are using for the Trainer class, so I cannot comment on its internals or whether they might cause issues.
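To rule out an overlap between the splits, a quick sanity check could look something like the sketch below. It assumes each dataset keeps a per-sample list (called datalist here) with a unique key such as an image path; those names are placeholders for whatever your Trainer and Validation datasets actually expose.

def check_split_overlap(train_dataset, val_dataset, key='img_path'):
    # 'datalist' and 'img_path' are hypothetical names; adjust them to
    # whatever your dataset classes actually store per sample
    train_ids = {sample[key] for sample in train_dataset.datalist}
    val_ids = {sample[key] for sample in val_dataset.datalist}
    overlap = train_ids & val_ids
    print('%d samples appear in both splits' % len(overlap))
    return overlap

If the printed count is non-zero, the two splits are not independent.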

Hi @ptrblck. Thank you for the prompt response. My validation dataset is currently the same as the test set, since I am not tuning any hyperparameters with it. Also, I don’t understand what you mean by the package with the Trainer class; I am not using anything beyond what is normally used, if that helps. The Trainer class is a bit messy and hard to follow, which is why I haven’t posted it here.