I totally don’t understand what is wrong with this minimal example:
import os

import torch.distributed as dist
import torch.multiprocessing as mp

def setup(rank, world_size, seed):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # Initialize the process group
    dist.init_process_group("gloo", rank=rank, world_size=world_size)
    # Explicitly set the seed so that the models created in the
    # spawned processes start from the same random weights and biases.
    set_seed(seed)

def fn(rank, world_size):
    setup(rank, world_size, 0)
    print(rank)
    cleanup()

def cleanup():
    dist.destroy_process_group()

def run(fn, world_size):
    # mp.spawn passes the process index (the rank) as the first argument to fn
    mp.spawn(fn, args=(world_size,), nprocs=world_size, join=True)

if __name__ == '__main__':
    run(fn, 4)
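For completeness, set_seed is nothing special; it's roughly this (a minimal sketch that just seeds the usual RNGs):

import random

import numpy as np
import torch

def set_seed(seed):
    # Seed every RNG that could affect model initialization
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)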
When I run it, I get a broken pipe error. Why?
I'm using torch.multiprocessing for mp and torch.distributed for dist, as in the imports above.