Script freezes after evaluation in distributed training

Hi,
I am trying to evaluate my model after each epoch on the main rank only. Somehow the script freezes after iterating through the complete validation set: the GPUs stop doing any work, but their memory is not released either.

Here is a sketch of what my training routine looks like:

import os
import logging

import numpy as np
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from torch.utils.tensorboard import SummaryWriter
from torchmetrics import MeanAbsoluteError, MeanSquaredError, MetricCollection, MetricTracker


class Experiment:

    def __init__(self, config):
        self.config = config
        self.current_iteration = 0

    def getModel(self):
        (...)
        return model

    def getDataClass(self):
        (...)
        return dataclass

    def getLossfunction(self):
        (...)
        return lossfunction

    def getOptimizer(self, model):
        (...)
        return optimizer, scheduler

    def run(self):
        """
        The main operator
        :return:
        """

        world_size = self.config.world_size
        
        mp.spawn(self.mainTrainLoop,
                 args=(world_size,),
                 nprocs=world_size,
                 join=True)

    def train(self, dl_train, optimizer, rank, model, lossfunction, logger, epoch):

        model.train()

        for batch_idx, batch in enumerate(dl_train):

            optimizer.zero_grad(set_to_none=True)
            
            x_i, y_i = batch
            x_i = x_i.to(rank)
            y_i = y_i.to(rank)
            z_i = model(x_i)

            # calc loss and do step
            loss = lossfunction(z_i, y_i)
            loss.backward()
            optimizer.step()


            if rank == 0:

                # logging and eval
                if batch_idx % self.config.terminal_interval == 0:
                    # print to commandline here
                    pass

                if batch_idx % self.config.tensorboard_interval == 0:
                    # Some tensorboard stuff here
                    pass

            if rank == 0:
                self.current_iteration += 1

    def validate(self, rank, val_dataloader, tracker, model, logger):

        tracker.increment()
        model.eval()
        current_eval_cycle = tracker.n_steps - 1  # n_steps starts at 1, so shift to zero-based indexing
        
        with torch.no_grad():
            for batch_idx, batch in enumerate(val_dataloader):
                
                x_i, y_i = batch
                x_i = x_i.to(rank)
                y_i = y_i.to(rank)
                z_i = model(x_i)

                tracker.update(z_i, y_i)

        for key, val in tracker.compute_all().items():
            # write to tensorboard here
            pass

        # if we have a new best metric save the model
        # here use current_eval_cycle instead of tracker.n_steps to index the array
        best_res, which_epoch = tracker.best_metric(return_step=True)
        for key, epoch in which_epoch.items():
            if epoch == current_eval_cycle:
                # save model here if best
                pass

    def mainTrainLoop(self, rank, world_size):

        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '12355'

        # initialize the process group
        dist.init_process_group("gloo", rank=rank, world_size=world_size)
        

        # --------------------------------------------------
        # get datasets, samplers for each rank

        dataclass = self.getDataClass()
        ds_train = dataclass.get_dataset("train")
        ds_val = dataclass.get_dataset("val")

        sampler_train = DistributedSampler(ds_train,
                                           num_replicas=world_size,
                                           rank=rank,
                                           shuffle=False,
                                           drop_last=self.config.dataloader.drop_last_batch)

        dl_train = DataLoader(ds_train,
                              batch_size=self.config.dataloader.batchsize,
                              pin_memory=False,
                              num_workers=0,
                              drop_last=self.config.dataloader.drop_last_batch,
                              shuffle=False,
                              sampler=sampler_train)


        # the val set needs no sampler
        # since it is only executed on rank 0
        dl_val = DataLoader(ds_val,
                            batch_size=self.config.dataloader.batchsize,
                            pin_memory=False,
                            num_workers=0,
                            drop_last=self.config.dataloader.drop_last_batch,
                            shuffle=False)
        
        # --------------------------------------------------
        # get model, optimizer and loss

        model = self.getModel().to(rank)
        
        # ResNet-18 has batch norm layers, so convert them to SyncBatchNorm:
        model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)

        model = DDP(model, device_ids=[rank])

        optimizer, scheduler = self.getOptimizer(model)
        lossfunction = self.getLossfunction().to(rank)

        # --------------------------------------------------
        # seeding

        torch.cuda.manual_seed(self.config.torchseed)
        np.random.seed(self.config.numpyseed)

        # --------------------------------------------------
        # logger setup

        logger = None
        if rank == 0:
            logger = logging.getLogger("Experiment")

        # --------------------------------------------------
        # Metrics and Tensorboard init

        tracker = None
        if rank == 0:

            list_of_metrics = [MeanSquaredError(), MeanAbsoluteError()]
            metric_coll = MetricCollection([m.to(rank) for m in list_of_metrics])
            # one maximize flag per metric; both MSE and MAE should be minimized
            tracker = MetricTracker(metric_coll, maximize=[False, False])

            # setup the tensorboard writer on rank 0
            # somehow it does not work if I set it up in the __init__ function and
            # then only use it from within rank 0.
            self.summary_writer = SummaryWriter(log_dir="/path/to/sth")


        # --------------------------------------------------
        # start training

        for epoch in range(1, self.config.max_epochs + 1):

            # if we are using DistributedSampler, we have to tell it which epoch this is
            dl_train.sampler.set_epoch(epoch)      

            self.train(dl_train,
                       optimizer,
                       rank,
                       model,
                       lossfunction,
                       logger,
                       epoch)


            # on rank 0, evaluate with
            # model.module instead of the DDP-wrapped model
            if rank == 0:

                self.validate(rank,
                              dl_val,
                              tracker,
                              model.module,
                              logger)


        if rank == 0:
            self.summary_writer.close()


if __name__ == '__main__':

    import hydra
    from omegaconf import DictConfig, OmegaConf

    @hydra.main(version_base=None, config_path="./configs/", config_name="config")
    def main(config: DictConfig) -> None:
        
        exp = Experiment(config)
        exp.run() 
        
    main()

Does anyone have a clue what is going on here?

Also: I couldn't find a good, complete example of multi-GPU training that covers everything: training, testing, saving and (potentially) logging. I would be super grateful for any tips here!

Thanks a lot!

Is it possible there is a sync point across all GPUs after validation completes? From the post above it seems you were able to iterate through the whole validation dataset, and the freeze happened after that iteration.
Given that you are running validation on only one GPU (rank 0), this might be one possible reason. Can you try running validation on all ranks, just to see if it works?
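
For example (a rough sketch against the code above, assuming the process group is already initialized as in mainTrainLoop), you could either call validate on every rank, or keep it on rank 0 and add an explicit synchronization point so the other ranks wait for it:

# inside the epoch loop of mainTrainLoop, after self.train(...)

# Option A: run validation on every rank; the tracker/tensorboard parts inside
# validate() would then need an `if rank == 0:` guard, since tracker is None
# on the other ranks
self.validate(rank, dl_val, tracker, model.module, logger)

# Option B: keep validation on rank 0 only, but make the other ranks wait
# at an explicit barrier before starting the next epoch
if rank == 0:
    self.validate(rank, dl_val, tracker, model.module, logger)
dist.barrier()

dist.barrier() is supported by both the gloo and nccl backends, so it fits your current gloo setup.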