Runtime error: connection reset by peer in init_process_group

Hey @Amit_Singh1, can you check whether the spawn start method works for you? I tried the following, and it works in my dev environment.

import os
import torch.distributed as dist
import torch.multiprocessing as mp

def run(rank, size):
    """Distributed function run in each process; prints its rank and the world size."""
    print(rank, size)

def init_process(rank, size, fn, backend="gloo"):
    """Initialize the distributed environment."""
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group(backend, rank=rank, world_size=size)
    fn(rank, size)

if __name__ == "__main__":
    size = 4
    processes = []
    mp.set_start_method("spawn")
    for rank in range(size):
        p = mp.Process(target=init_process, args=(rank, size, run))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

The PyTorch notes on multiprocessing best practices are also worth reading.
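
Relatedly, if manually creating the processes keeps failing, here is a minimal sketch using the torch.multiprocessing.spawn helper, which launches the workers with the spawn start method and joins them for you. The address, port, and world size are the same placeholder values as in the example above.

import os
import torch.distributed as dist
import torch.multiprocessing as mp

def worker(rank, size):
    # spawn passes the process index (rank) as the first argument automatically
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29500"
    dist.init_process_group("gloo", rank=rank, world_size=size)
    print(rank, size)
    dist.destroy_process_group()

if __name__ == "__main__":
    size = 4
    # starts `nprocs` worker processes and joins them before returning
    mp.spawn(worker, args=(size,), nprocs=size)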