Hey @Amit_Singh1, can you check whether the spawn start method works for you? I tried the following, and it works in my dev environment.
import os

import torch.distributed as dist
import torch.multiprocessing as mp


def run(rank, size):
    """Distributed function to run; argument order matches the fn(rank, size) call below."""
    print(rank, size)


def init_process(rank, size, fn, backend="gloo"):
    """Initialize the distributed environment."""
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)


if __name__ == "__main__":
    size = 4
    processes = []
    # Force the "spawn" start method before creating any worker processes.
    mp.set_start_method("spawn")
    for rank in range(size):
        p = mp.Process(target=init_process, args=(rank, size, run))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
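If managing the processes by hand is still flaky for you, torch.multiprocessing also provides a spawn() helper that starts nprocs workers, passes each one its rank as the first argument, and joins them for you. Here is a minimal sketch of the same script using it; the names and values are just illustrative, not part of the snippet above:

import os

import torch.distributed as dist
import torch.multiprocessing as mp


def init_process(rank, size, backend="gloo"):
    # Same setup as above; mp.spawn supplies the rank automatically.
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group(backend, rank=rank, world_size=size)
    print(rank, size)


if __name__ == "__main__":
    size = 4
    # spawn() launches `nprocs` processes, calls init_process(rank, *args)
    # in each one, and waits for them all to finish when join=True.
    mp.spawn(init_process, args=(size,), nprocs=size, join=True)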
Here are some best practices for multiprocessing.