I’m training a model using DDP on 4 GPUs and 32 vcpus.
I launch the processes with torch.multiprocessing.spawn (mp.spawn). With num_workers=0 the code below runs fine: it trains the 3 models one after the other.
But when I run the same code with num_workers=4, training for model1 is about 3.3x faster;
however, after the training of model1 completes (all ranks reach “training complete”), the program hangs inside mp.spawn() and the training of model2 never starts.
# Per-rank DDP worker. multi_gpu_process hands this function to mp.spawn, which
# calls it once in every spawned process and supplies `rank` as the first
# argument; the remaining arguments come from the `args` tuple given to mp.spawn.
#
# Steps performed: set the rendezvous address/port, join the NCCL process
# group, move the model to this rank's device, convert it to SyncBatchNorm and
# wrap it in DDP, build the data loaders, call train_model, and finally return
# the string "training complete".
#
# Parameters:
#   rank        - index of this process; also used as the CUDA device index.
#   model_class - passed to load_model() to build the model.
#   mel_spec    - forwarded to Duration_Dataset.
#   world_size  - total number of processes in the process group.
#   name, path, metric_key, eval_mode - forwarded to train_model.
#   wandb_      - its use is not visible in this paste (the train_model call is cut off).
#
# NOTE(review): two lines in this paste are cut off (the DataLoader line ending
# in `conf$` and the train_model line ending in `eval_mode=e`), so the full
# argument lists cannot be confirmed from here.
# NOTE(review): no destroy_process_group() call is visible before the return,
# and the train loader is created with persistent_workers=True. Either may keep
# a rank (or its loader worker processes) alive after training finishes, which
# would match the reported hang in mp.spawn when num_workers > 0 - confirm by
# tearing both down explicitly at the end of this function.
def multi_gpu_training(rank,model_class,mel_spec,world_size,name,path,metric_key,eval_mode,wandb_):
torch.multiprocessing.set_sharing_strategy('file_system') # too many files open error
# Rendezvous endpoint used by init_process_group.
# NOTE(review): the port is fixed, and this function is run three times in a row
# (once per model); verify the port is free again before the next run starts.
os.environ['MASTER_ADDR']="localhost"
os.environ["MASTER_PORT"]="12335"
init_process_group(backend='nccl',rank=rank, world_size=world_size)
# The rank doubles as the CUDA device index for this process.
device=torch.device(rank)
print("rank",rank,device)
model = load_model(model_class).to(device)
# Swap BatchNorm layers for SyncBatchNorm before wrapping the model in DDP.
model = nn.SyncBatchNorm.convert_sync_batchnorm(model)
model = DDP(model,device_ids=[rank],find_unused_parameters=False)
train_dataset = Duration_Dataset(train_file,config.f0_file,config.durations_file,config.xvectors_file,mel_spec=mel_spec)
# From here on `train_dataset` names the DataLoader, not the Dataset.
# NOTE(review): the visible part of this line has shuffle=False and shows no
# sampler; the line is cut off, so confirm whether a DistributedSampler is
# passed - without one every rank would iterate over the same data.
train_dataset = DataLoader(train_dataset,pin_memory=True,persistent_workers=True,batch_size=config.batch_size,shuffle=False,collate_fn=batch_processing,num_workers=conf$
# The validation loader is only built on rank 0; on other ranks it stays None.
test_dataset = None
if rank == 0:
test_dataset = Duration_Dataset(val_file,config.f0_file,config.durations_file,config.xvectors_file,mel_spec=mel_spec)
test_dataset = DataLoader(test_dataset,batch_size=config.batch_size,shuffle=False,collate_fn=batch_processing,sampler=None)
train_model(train_dataset,model,device,name=name,path=path,metrics_key=metric_key,eval_mode=e
return "training complete"
def multi_gpu_process(model,mel_spec,name,path,metrics_key,eval_mode,wandb_):
    """Launch one ``multi_gpu_training`` process per CUDA device.

    The number of processes equals ``torch.cuda.device_count()``. That count
    is also forwarded to every worker as its ``world_size``. ``mp.spawn``
    prepends the process index (rank) to the argument tuple built here.

    Args:
        model: Forwarded to the worker as its ``model_class`` argument.
        mel_spec: Forwarded to the worker unchanged.
        name: Forwarded to the worker unchanged.
        path: Forwarded to the worker unchanged.
        metrics_key: Forwarded to the worker as its ``metric_key`` argument.
        eval_mode: Forwarded to the worker unchanged.
        wandb_: Forwarded to the worker unchanged.
    """
    #torch.multiprocessing.set_start_method('spawn')
    gpu_count = torch.cuda.device_count()
    print('World Size:', gpu_count)

    # Everything the worker needs except the rank, which mp.spawn supplies.
    worker_args = (
        model,
        mel_spec,
        gpu_count,
        name,
        path,
        metrics_key,
        eval_mode,
        wandb_,
    )
    mp.spawn(multi_gpu_training, args=worker_args, nprocs=gpu_count)
    print('out of the spawning')
if __name__=='__main__':
    # Train the three models one after the other; each call spawns its own
    # set of worker processes and returns once mp.spawn has returned.
    for current_model in (model1, model2, model3):
        multi_gpu_process(current_model,mel_spec,name,path,metrics_key,eval_mode,wandb_)