If I do it like this, calling `mp.spawn` first and `init_process_group` afterwards:
if __name__ == "__main__":
mp.spawn(train, nprocs=1, args=(args, ))
setup(rank = args.rank, world_size= args.nodes, args= args)
I get the following error:
Traceback (most recent call last):
line 233, in <module>
mp.spawn(train, nprocs=1, args=(args, ))
File "/home/user/anaconda3/envs/p3m/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
File "/home/user/anaconda3/envs/p3m/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
while not context.join():
File "/home/user/anaconda3/envs/p3m/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 160, in join
raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:
-- Process 0 terminated with the following error:
Traceback (most recent call last):
File "/home/user/anaconda3/envs/p3m/lib/python3.10/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
fn(i, *args)
File "...........path/train_ddp_example.py", line 70, in train
sampler_train, train_loader = load_dataset(args)
File "...........path/train_ddp_example.py", line 54, in load_dataset
sampler_train = DistributedSampler(train_set)
File "/home/user/anaconda3/envs/p3m/lib/python3.10/site-packages/torch/utils/data/distributed.py", line 67, in __init__
num_replicas = dist.get_world_size()
File "/home/user/anaconda3/envs/p3m/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 867, in get_world_size
return _get_group_size(group)
File "/home/user/anaconda3/envs/p3m/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 325, in _get_group_size
default_pg = _get_default_group()
File "/home/user/anaconda3/envs/p3m/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 429, in _get_default_group
raise RuntimeError(
RuntimeError: Default process group has not been initialized, please make sure to call init_process_group.
Any ideas?