I am trying to perform distributed training across 2 nodes, each with 1 GPU (RTX 3090).
However, I am stuck on the following error.
- When I treat node 1 as the master node, I run the following torchrun command on node 1:
torchrun --nproc_per_node=1 --nnodes=2 --node_rank=0 --rdzv_id=619 --rdzv_backend=c10d --rdzv_endpoint=10.165.179.57:7861 trainer.py -cfg configs/workernode.yaml
torch version : 1.13.1+cu117
The following is the error from node 2 after running the same torchrun command there (I change the node rank to --node_rank=1 before executing). The node 1 console, meanwhile, just waits until I interrupt it from the keyboard.
Traceback (most recent call last):
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 241, in launch_agent
result = agent.run()
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 723, in run
result = self._invoke_run(role)
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 858, in _invoke_run
self._initialize_workers(self._worker_group)
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 692, in _initialize_workers
self._rendezvous(worker_group)
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 546, in _rendezvous
store, group_rank, group_world_size = spec.rdzv_handler.next_rendezvous()
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1027, in next_rendezvous
self._op_executor.run(exit_op, deadline)
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 607, in run
has_set = self._state_holder.sync()
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 415, in sync
self._state = pickle.loads(state_bits)
File "<string>", line 3, in __hash__
AttributeError: '_NodeDesc' object has no attribute 'addr'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/vjj2kor/miniconda3/envs/ddp/bin/torchrun", line 8, in <module>
sys.exit(main())
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/run.py", line 794, in main
run(args)
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/run.py", line 785, in run
elastic_launch(
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 134, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 270, in launch_agent
spec.rdzv_handler.shutdown()
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1103, in shutdown
self._close()
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1127, in _close
self._op_executor.run(op, deadline)
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 607, in run
has_set = self._state_holder.sync()
File "/home/vjj2kor/miniconda3/envs/ddp/lib/python3.8/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 415, in sync
self._state = pickle.loads(state_bits)
File "<string>", line 3, in __hash__
AttributeError: '_NodeDesc' object has no attribute 'addr'
Similarly, when I treat node 2 as the master node and repeat the process, this is the error from the node 1 console:
torch version : 2.0.0+cu117
Traceback (most recent call last):
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 237, in launch_agent
result = agent.run()
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 709, in run
result = self._invoke_run(role)
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 844, in _invoke_run
self._initialize_workers(self._worker_group)
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 678, in _initialize_workers
self._rendezvous(worker_group)
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/elastic/metrics/api.py", line 129, in wrapper
result = f(*args, **kwargs)
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/elastic/agent/server/api.py", line 538, in _rendezvous
store, group_rank, group_world_size = spec.rdzv_handler.next_rendezvous()
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1024, in next_rendezvous
self._op_executor.run(exit_op, deadline)
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 607, in run
has_set = self._state_holder.sync()
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 415, in sync
self._state = pickle.loads(state_bits)
File "<string>", line 3, in __hash__
AttributeError: '_NodeDesc' object has no attribute 'fqdn'
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/vjj2kor/miniconda3/envs/ddp/bin/torchrun", line 33, in <module>
sys.exit(load_entry_point('torch==1.12.1', 'console_scripts', 'torchrun')())
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/run.py", line 762, in main
run(args)
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/launcher/api.py", line 266, in launch_agent
spec.rdzv_handler.shutdown()
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1100, in shutdown
self._close()
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 1124, in _close
self._op_executor.run(op, deadline)
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 607, in run
has_set = self._state_holder.sync()
File "/home/vjj2kor/.local/lib/python3.8/site-packages/torch/distributed/elastic/rendezvous/dynamic_rendezvous.py", line 415, in sync
self._state = pickle.loads(state_bits)
File "<string>", line 3, in __hash__
AttributeError: '_NodeDesc' object has no attribute 'fqdn'