Hi, I'm having trouble with `distributed.barrier()`. I use it to make the other ranks wait while rank 0 runs the test and saves the parameters: with DDP training every rank holds the same parameters, so I see no need for all ranks to run the test and save.
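For context, this is the pattern I have in mind, stripped down to a minimal sketch (the `gloo` backend, the `worker` helper, and the `print` calls are just placeholders here, not my real training script):

```python
import os
import torch.distributed as distributed
import torch.multiprocessing as mp

def worker(rank: int, world_size: int, epochs: int = 2):
    os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
    os.environ.setdefault("MASTER_PORT", "29500")
    distributed.init_process_group("gloo", rank=rank, world_size=world_size)

    for epoch in range(epochs):
        # ... every rank would run its training step here ...
        if rank == 0:
            # Under DDP all ranks hold identical parameters, so only rank 0
            # would run the test and write the checkpoint.
            print(f"[rank 0] epoch {epoch}: test + save")
        # The remaining ranks wait here until rank 0 has finished.
        distributed.barrier()

    distributed.destroy_process_group()

if __name__ == "__main__":
    mp.spawn(worker, args=(2,), nprocs=2)
```

In my real script the placeholders are replaced by the actual training, test, and save calls below.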
Here is the relevant part of my training loop:
```python
distributed.barrier()  # first barrier
for epoch in range(resume_epoch, epochs):
    tic = time.time()
    if not cfg.data.transform.dali_pipe:
        train_sampler.set_epoch(epoch)
    train_one_epoch(model, train_loader, Loss, optimizer, epoch, lr_scheduler, logger,
                    (top1_acc, loss_record, *train_dc), scaler, gpu, args, cfg)
    if is_first_rank:
        # Only rank 0 logs timing, runs the test, and saves checkpoints.
        one_epoch_time_cost = int(time.time() - tic)
        train_speed = cfg.data.num_training_samples // one_epoch_time_cost
        train_time_cost = "%02d:%02d:%02d" % seconds_to_time(one_epoch_time_cost)
        logger.info(f'Finish one epoch cost {train_time_cost}, speed: {train_speed} samples/s.')
        if not cfg.test.no_test:
            test(model, val_loader, Loss, epoch, logger, (top1_acc, top5_acc, loss_record), gpu)
            acc = top1_acc.get()
            checkpoint = {
                'model': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scaler': scaler.state_dict(),
                'lr_scheduler': lr_scheduler.state_dict(),
            }
            torch.save(checkpoint, '{}/{}_{}_{:.5}.pt'.format(args.out_path, cfg.model.network, epoch, acc))
            if acc > best_top1_acc:
                # Replace the previously saved best backbone with the new one.
                old_backbone = '{}/{}_backbone_{:.5}.pth'.format(args.out_path, cfg.model.network, best_top1_acc)
                if os.path.exists(old_backbone):
                    os.remove(old_backbone)
                best_top1_acc = acc
                torch.save(checkpoint['model'], '{}/{}_backbone_{:.5}.pth'.format(args.out_path, cfg.model.network, acc))
    if cfg.data.transform.dali_pipe.enable:
        train_loader.reset()
    logger.info(f"rank:{gpu} got here.")
    distributed.barrier()  # second barrier: other ranks wait here for rank 0
    logger.info(f"rank:{gpu} pass here.")
```
My issue is that all ranks pass the first barrier, and all ranks reach the second barrier, but none of them ever get past it.
Could you please give me some advice?