EnforceNotMet error

I have a very simple script:


def setup():
    """Initialize the gloo process group for a launcher-started worker.

    Reads MASTER_ADDR, MASTER_PORT, LOCAL_RANK, WORLD_SIZE and RANK from the
    environment, joins the process group over TCP at the master endpoint,
    and prints a summary of the resulting group. If torch.distributed is
    not available, prints a notice and returns without initializing.

    Raises:
        KeyError: if any of the environment variables above is missing.
        ValueError: if a rank / world-size variable is not an integer.
    """
    if not torch.distributed.is_available():
        print("Distributed not available")
        return

    # Read the master endpoint once so that the address we print is
    # guaranteed to be the address we actually connect to.
    master_addr = os.environ['MASTER_ADDR']
    master_port = os.environ['MASTER_PORT']
    print(f"Master: {master_addr}:{master_port}")

    # Environment variables set by torch.distributed.launch or torchrun
    local_rank = int(os.environ['LOCAL_RANK'])
    world_size = int(os.environ['WORLD_SIZE'])
    world_rank = int(os.environ['RANK'])

    print(f"Init... local_rank={local_rank}, world_size={world_size}, world_rank={world_rank}")

    # initialize the process group; the endpoint comes from the launcher's
    # environment instead of a hard-coded host/port
    dist.init_process_group("gloo",
                            init_method=f"tcp://{master_addr}:{master_port}",
                            rank=world_rank,
                            world_size=world_size)

    print(
        f"[{os.getpid()}] "
        f"{os.environ} "
        f"world_size = {dist.get_world_size()}, "
        f"rank = {dist.get_rank()}, "
        f"backend={dist.get_backend()}"
    )

# Script entry point: run the distributed setup only when executed directly
# (e.g. by torchrun), not when this module is imported.
if __name__ == "__main__":
    setup()

I run it on the master as:

torchrun --nproc_per_node=1 --nnodes=2 --node_rank=0 --master_addr=MyLocalPc --master_port=777 main.py

And on the host:

torchrun --nproc_per_node=1 --nnodes=2 --node_rank=1 --master_addr=MyLocalPc --master_port=777 main.py

On the host, I have added MyLocalPc's IP address to /etc/hosts.

However, the host crashes with this:

NOTE: Redirects are currently not supported in Windows or MacOs.
Master: MyLocalPc:777
Init... local_rank=0, world_size=2, world_rank=1
libc++abi: terminating due to uncaught exception of type gloo::EnforceNotMet: [enforce fail at /Users/runner/work/pytorch/pytorch/pytorch/third_party/gloo/gloo/transport/uv/pair.cc:549] state_ != CONNECTING. 1 vs 1. Cannot close pair while waiting on connection
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 0 (pid: 5387) of binary: /Users/user/miniconda3/envs/nnframework/bin/python
Traceback (most recent call last):
  File "/Users/user/miniconda3/envs/nnframework/bin/torchrun", line 8, in <module>
    sys.exit(main())
             ^^^^^^
  File "/Users/user/miniconda3/envs/nnframework/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^
  File "/Users/user/miniconda3/envs/nnframework/lib/python3.11/site-packages/torch/distributed/run.py", line 794, in main
    run(args)
  File "/Users/user/miniconda3/envs/nnframework/lib/python3.11/site-packages/torch/distributed/run.py", line 785, in run
    elastic_launch(
  File "/Users/user/miniconda3/envs/nnframework/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/Users/user/miniconda3/envs/nnframework/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
main.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2023-07-06_14:31:20
  host      : 1.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.ip6.arpa
  rank      : 1 (local_rank: 0)
  exitcode  : -6 (pid: 5387)
  error_file: <N/A>
  traceback : Signal 6 (SIGABRT) received by PID 5387
============================================================

And the master ends with:

NOTE: Redirects are currently not supported in Windows or MacOs.
Master: MyLocalPc:777
Init... local_rank=0, world_size=2, world_rank=0
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 3221226505) local_rank: 0 (pid: 38008) of binary: E:\Python\conda_env_pytorch2\python.exe
Traceback (most recent call last):
  File "E:\Python\conda_env_pytorch2\lib\runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "E:\Python\conda_env_pytorch2\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "E:\Python\conda_env_pytorch2\Scripts\torchrun.exe\__main__.py", line 7, in <module>
  File "E:\Python\conda_env_pytorch2\lib\site-packages\torch\distributed\elastic\multiprocessing\errors\__init__.py", line 346, in wrapper
    return f(*args, **kwargs)
  File "E:\Python\conda_env_pytorch2\lib\site-packages\torch\distributed\run.py", line 794, in main
    run(args)
  File "E:\Python\conda_env_pytorch2\lib\site-packages\torch\distributed\run.py", line 785, in run
    elastic_launch(
  File "E:\Python\conda_env_pytorch2\lib\site-packages\torch\distributed\launcher\api.py", line 134, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "E:\Python\conda_env_pytorch2\lib\site-packages\torch\distributed\launcher\api.py", line 250, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
main.py FAILED
------------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2023-07-06_14:31:19
  host      : MyLocalPc
  rank      : 0 (local_rank: 0)
  exitcode  : 3221226505 (pid: 38008)
  error_file: <N/A>
  traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================

What is going on?

I have PyTorch 2.0.1. The master is a Windows machine; the host is running macOS.