I am having trouble getting a free port in the DDP setup block of a PyTorch training job that is parallelized across multiple GPUs on a Linux HPC cluster.
I am submitting the training job to the cluster through a SLURM script. The model and data need to be parallelized across 4 GPUs, for which I am using PyTorch's Distributed Data Parallel (DDP) module. I have defined the DDP setup block as follows:
import os
import socket
from datetime import timedelta

import torch
import torch.distributed as dist
from torch.distributed import init_process_group


def ddp_setup(rank, world_size):
    """
    Args:
        rank: Unique identifier of each process
        world_size: Total number of processes
    """
    try:
        # Find a free port dynamically
        def find_free_port():
            with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
                s.bind(('', 0))
                s.listen(1)
                port = s.getsockname()[1]
                return port

        # Destroy any existing process groups
        if dist.is_initialized():
            dist.destroy_process_group()

        # Set environment variables with a dynamic port
        os.environ['MASTER_ADDR'] = 'localhost'
        master_port = find_free_port()
        os.environ['MASTER_PORT'] = str(master_port)

        # Initialize process group with additional error handling
        init_process_group(
            backend="nccl",
            rank=rank,
            world_size=world_size,
            timeout=timedelta(minutes=60),
            init_method='env://'
        )

        # Set the GPU device
        torch.cuda.set_device(rank)
        print(f"Process group initialized successfully for rank {rank}")

    except RuntimeError as e:
        print(f"Error initializing process group: {e}")
        # Provide more specific error handling for port clashes
        if "Address already in use" in str(e):
            print("Port is already in use. Ensure no previous processes are running.")
        raise
    except Exception as e:
        print(f"Unexpected error in ddp_setup: {e}")
        raise
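For context, the per-GPU processes are launched with mp.spawn. A condensed sketch reconstructed from the traceback further below (the argparse defaults are placeholders and the body of main is elided) looks roughly like this:

import argparse
import torch.multiprocessing as mp


def main(rank, world_size, save_every, total_epochs, batch_size):
    ddp_setup(rank, world_size)  # every spawned worker runs the setup shown above
    # ... model, data loaders and training loop elided ...


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--save_every", type=int, default=1)     # placeholder default
    parser.add_argument("--total_epochs", type=int, default=10)  # placeholder default
    parser.add_argument("--batch_size", type=int, default=32)    # placeholder default
    args = parser.parse_args()

    world_size = 4  # one spawned process per GPU
    mp.spawn(main,
             args=(world_size, args.save_every, args.total_epochs, args.batch_size),
             nprocs=world_size)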
However, every time I run the code, the allotted port is reported as “already in use”, regardless of which port number is chosen. I have attached the error output below:
[W socket.cpp:426] [c10d] The server socket has failed to listen on [::]:12356 (errno: 98 - Address already in use).
[W socket.cpp:426] [c10d] The server socket has failed to bind to 0.0.0.0:12356 (errno: 98 - Address already in use).
[E socket.cpp:462] [c10d] The server socket has failed to listen on any local network address.
Traceback (most recent call last):
  File "/scratch/j20240138/final_rdnet.py", line 401, in <module>
    mp.spawn(main, args=(world_size, args.save_every, args.total_epochs, args.batch_size), nprocs=world_size)
  File "/home/j20240138/miniconda3/envs/projenv/lib/python3.11/site-packages/torch/multiprocessing/spawn.py", line 239, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/home/j20240138/miniconda3/envs/projenv/lib/python3.11/site-packages/torch/multiprocessing/spawn.py", line 197, in start_processes
    while not context.join():
  File "/home/j20240138/miniconda3/envs/projenv/lib/python3.11/site-packages/torch/multiprocessing/spawn.py", line 160, in join
    raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/home/j20240138/miniconda3/envs/projenv/lib/python3.11/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
    fn(i, *args)
  File "/scratch/j20240138/final_rdnet.py", line 369, in main
    ddp_setup(rank, world_size)
  File "/scratch/j20240138/final_rdnet.py", line 195, in ddp_setup
    init_process_group(backend="nccl", rank=rank, world_size=world_size, timeout=timedelta(minutes=60))
  File "/home/j20240138/miniconda3/envs/projenv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py", line 900, in init_process_group
    store, rank, world_size = next(rendezvous_iterator)
  File "/home/j20240138/miniconda3/envs/projenv/lib/python3.11/site-packages/torch/distributed/rendezvous.py", line 245, in _env_rendezvous_handler
    store = _create_c10d_store(master_addr, master_port, rank, world_size, timeout)
  File "/home/j20240138/miniconda3/envs/projenv/lib/python3.11/site-packages/torch/distributed/rendezvous.py", line 176, in _create_c10d_store
    return TCPStore(
RuntimeError: The server socket has failed to listen on any local network address. The server socket has failed to listen on [::]:12356 (errno: 98 - Address already in use). The server socket has failed to bind to 0.0.0.0:12356 (errno: 98 - Address already in use).

srun: error: gpu010: task 0: Exited with exit code 1

Traceback (most recent call last):
  File "/scratch/j20240138/final_rdnet.py", line 401, in <module>
    mp.spawn(main, args=(world_size, args.save_every, args.total_epochs, args.batch_size), nprocs=world_size)
  File "/home/j20240138/miniconda3/envs/projenv/lib/python3.11/site-packages/torch/multiprocessing/spawn.py", line 239, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/home/j20240138/miniconda3/envs/projenv/lib/python3.11/site-packages/torch/multiprocessing/spawn.py", line 197, in start_processes
    while not context.join():
  File "/home/j20240138/miniconda3/envs/projenv/lib/python3.11/site-packages/torch/multiprocessing/spawn.py", line 160, in join
    raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException:

-- Process 0 terminated with the following error:
Traceback (most recent call last):
  File "/home/j20240138/miniconda3/envs/projenv/lib/python3.11/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
    fn(i, *args)
  File "/scratch/j20240138/final_rdnet.py", line 369, in main
    ddp_setup(rank, world_size)
  File "/scratch/j20240138/final_rdnet.py", line 195, in ddp_setup
    init_process_group(backend="nccl", rank=rank, world_size=world_size, timeout=timedelta(minutes=60))
  File "/home/j20240138/miniconda3/envs/projenv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py", line 932, in init_process_group
    _store_based_barrier(rank, store, timeout)
  File "/home/j20240138/miniconda3/envs/projenv/lib/python3.11/site-packages/torch/distributed/distributed_c10d.py", line 469, in _store_based_barrier
    raise RuntimeError(
RuntimeError: Timed out initializing process group in store based barrier on rank: 0, for key: store_based_barrier_key:1 (world_size=2, worker_count=3, timeout=1:00:00)

srun: error: gpu010: task 1: Exited with exit code 1
I need help in resolving this issue.