I have a very simple script:
def setup():
    """Join the default ``gloo`` process group of a torchrun-launched job.

    The rendezvous endpoint and the rank layout are read from the
    environment variables exported by ``torchrun`` (or
    ``torch.distributed.launch``): ``MASTER_ADDR``, ``MASTER_PORT``,
    ``LOCAL_RANK``, ``WORLD_SIZE`` and ``RANK``.

    Returns:
        None. Prints a diagnostic line and returns early when
        ``torch.distributed`` is not available in this build.

    Raises:
        KeyError: if one of the launcher environment variables is missing,
            i.e. the script was not started through the launcher.
    """
    if not torch.distributed.is_available():
        print("Distributed not available")
        return

    # Environment variables set by torch.distributed.launch or torchrun
    env = os.environ
    local_rank = int(env['LOCAL_RANK'])
    world_size = int(env['WORLD_SIZE'])
    world_rank = int(env['RANK'])

    print(f"Master: {env['MASTER_ADDR']}:{env['MASTER_PORT']}")
    print(f"Init... local_rank={local_rank}, world_size={world_size}, world_rank={world_rank}")

    # Initialize the process group through the launcher-provided environment
    # ("env://") rather than a second, hard-coded tcp:// endpoint. The
    # launcher already received --master_addr/--master_port; repeating them
    # here can drift from the command line and may collide with the store
    # the launcher itself runs on that port.
    dist.init_process_group(
        "gloo",
        init_method="env://",
        rank=world_rank,
        world_size=world_size,
    )

    print(
        f"[{os.getpid()}] "
        f"{os.environ} "
        f"world_size = {dist.get_world_size()}, "
        f"rank = {dist.get_rank()}, "
        f"backend={dist.get_backend()}"
    )


if __name__ == "__main__":
    setup()
I run it on the master as:
torchrun --nproc_per_node=1 --nnodes=2 --node_rank=0 --master_addr=MyLocalPc --master_port=777 main.py
And on the host (the second node, node_rank=1):
torchrun --nproc_per_node=1 --nnodes=2 --node_rank=1 --master_addr=MyLocalPc --master_port=777 main.py
On the host, I have added MyLocalPc's IP address to /etc/hosts.
However, the host crashes with this:
NOTE: Redirects are currently not supported in Windows or MacOs.
Master: MyLocalPc:777
Init... local_rank=0, world_size=2, world_rank=1
libc++abi: terminating due to uncaught exception of type gloo::EnforceNotMet: [enforce fail at /Users/runner/work/pytorch/pytorch/pytorch/third_party/gloo/gloo/transport/uv/pair.cc:549] state_ != CONNECTING. 1 vs 1. Cannot close pair while waiting on connection
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -6) local_rank: 0 (pid: 5387) of binary: /Users/user/miniconda3/envs/nnframework/bin/python
Traceback (most recent call last):
File "/Users/user/miniconda3/envs/nnframework/bin/torchrun", line 8, in <module>
sys.exit(main())
^^^^^^
File "/Users/user/miniconda3/envs/nnframework/lib/python3.11/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
^^^^^^^^^^^^^^^^^^
File "/Users/user/miniconda3/envs/nnframework/lib/python3.11/site-packages/torch/distributed/run.py", line 794, in main
run(args)
File "/Users/user/miniconda3/envs/nnframework/lib/python3.11/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/Users/user/miniconda3/envs/nnframework/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/Users/user/miniconda3/envs/nnframework/lib/python3.11/site-packages/torch/distributed/launcher/api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
main.py FAILED
------------------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2023-07-06_14:31:20
host : 1.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.0.ip6.arpa
rank : 1 (local_rank: 0)
exitcode : -6 (pid: 5387)
error_file: <N/A>
traceback : Signal 6 (SIGABRT) received by PID 5387
============================================================
And the master ends with:
NOTE: Redirects are currently not supported in Windows or MacOs.
Master: MyLocalPc:777
Init... local_rank=0, world_size=2, world_rank=0
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 3221226505) local_rank: 0 (pid: 38008) of binary: E:\Python\conda_env_pytorch2\python.exe
Traceback (most recent call last):
File "E:\Python\conda_env_pytorch2\lib\runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "E:\Python\conda_env_pytorch2\lib\runpy.py", line 87, in _run_code
exec(code, run_globals)
File "E:\Python\conda_env_pytorch2\Scripts\torchrun.exe\__main__.py", line 7, in <module>
File "E:\Python\conda_env_pytorch2\lib\site-packages\torch\distributed\elastic\multiprocessing\errors\__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "E:\Python\conda_env_pytorch2\lib\site-packages\torch\distributed\run.py", line 794, in main
run(args)
File "E:\Python\conda_env_pytorch2\lib\site-packages\torch\distributed\run.py", line 785, in run
elastic_launch(
File "E:\Python\conda_env_pytorch2\lib\site-packages\torch\distributed\launcher\api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "E:\Python\conda_env_pytorch2\lib\site-packages\torch\distributed\launcher\api.py", line 250, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
main.py FAILED
------------------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2023-07-06_14:31:19
host : MyLocalPc
rank : 0 (local_rank: 0)
exitcode : 3221226505 (pid: 38008)
error_file: <N/A>
traceback : To enable traceback see: https://pytorch.org/docs/stable/elastic/errors.html
============================================================
What is going on?
I have PyTorch 2.0.1. The master is a Windows machine; the host is running macOS.