DistributedDataParallel Error when using two networks

Hi.

I am running the PPO algorithm for my RL project and I am trying to use DDP to speed up the training. However, when I coded up PPO, I did it with two networks: policy and value. On my first attempt, I got the error:

Traceback (most recent call last):
 File "<string>", line 1, in <module>
 File "/fslhome/dskinne3/.conda/envs/mamba_knot/lib/python3.7/multiprocessing/spawn.py", line 105, in spawn_main
   exitcode = _main(fd)
 File "/fslhome/dskinne3/.conda/envs/mamba_knot/lib/python3.7/multiprocessing/spawn.py", line 114, in _main
   prepare(preparation_data)
 File "/fslhome/dskinne3/.conda/envs/mamba_knot/lib/python3.7/multiprocessing/spawn.py", line 225, in prepare
   _fixup_main_from_path(data['init_main_from_path'])
 File "/fslhome/dskinne3/.conda/envs/mamba_knot/lib/python3.7/multiprocessing/spawn.py", line 277, in _fixup_main_from_path
   run_name="__mp_main__")
 File "/fslhome/dskinne3/.conda/envs/mamba_knot/lib/python3.7/runpy.py", line 263, in run_path
   pkg_name=pkg_name, script_name=fname)
 File "/fslhome/dskinne3/.conda/envs/mamba_knot/lib/python3.7/runpy.py", line 96, in _run_module_code
   mod_name, mod_spec, pkg_name, script_name)
 File "/fslhome/dskinne3/.conda/envs/mamba_knot/lib/python3.7/runpy.py", line 85, in _run_code
   exec(code, run_globals)
 File "/zhome/dskinne3/knot_gpu.py", line 347, in <module>
   results_ppo = main()
 File "/zhome/dskinne3/knot_gpu.py", line 234, in main
   mp.spawn(ppo_main, nprocs=args.gpus, args=(args,))
 File "/fslhome/dskinne3/.local/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
   return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
 File "/fslhome/dskinne3/.local/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 189, in start_processes
   process.start()
 File "/fslhome/dskinne3/.conda/envs/mamba_knot/lib/python3.7/multiprocessing/process.py", line 112, in start
   self._popen = self._Popen(self)
 File "/fslhome/dskinne3/.conda/envs/mamba_knot/lib/python3.7/multiprocessing/context.py", line 284, in _Popen
   return Popen(process_obj)
 File "/fslhome/dskinne3/.conda/envs/mamba_knot/lib/python3.7/multiprocessing/popen_spawn_posix.py", line 32, in __init__
   super().__init__(process_obj)
 File "/fslhome/dskinne3/.conda/envs/mamba_knot/lib/python3.7/multiprocessing/popen_fork.py", line 20, in __init__
   self._launch(process_obj)
 File "/fslhome/dskinne3/.conda/envs/mamba_knot/lib/python3.7/multiprocessing/popen_spawn_posix.py", line 42, in _launch
   prep_data = spawn.get_preparation_data(process_obj._name)
 File "/fslhome/dskinne3/.conda/envs/mamba_knot/lib/python3.7/multiprocessing/spawn.py", line 143, in get_preparation_data
   _check_not_importing_main()
 File "/fslhome/dskinne3/.conda/envs/mamba_knot/lib/python3.7/multiprocessing/spawn.py", line 136, in _check_not_importing_main
   is not going to be frozen to produce an executable.''')
RuntimeError: 
       An attempt has been made to start a new process before the
       current process has finished its bootstrapping phase.

       This probably means that you are not using fork to start your
       child processes and you have forgotten to use the proper idiom
       in the main module:

           if __name__ == '__main__':
               freeze_support()
               ...

       The "freeze_support()" line can be omitted if the program
       is not going to be frozen to produce an executable.
Traceback (most recent call last):
 File "knot_gpu.py", line 347, in <module>
   results_ppo = main()
 File "knot_gpu.py", line 234, in main
   mp.spawn(ppo_main, nprocs=args.gpus, args=(args,))
 File "/fslhome/dskinne3/.local/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
   return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
 File "/fslhome/dskinne3/.local/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
   while not context.join():
 File "/fslhome/dskinne3/.local/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 154, in join
   exit_code=exitcode
torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with exit code 1

As for the way I have my code formatted, I have the following main function (which I got from this tutorial):

def _resolve_master_addr(default='127.0.0.1'):
  """Return the hostname of the Slurm batch host, or `default` if unavailable.

  Runs `scontrol show job <job id>` and extracts the `BatchHost=` field,
  which is what the shell pipeline
  `scontrol show job $SLURM_JOBID | awk -F= '/BatchHost/ {print $2}'`
  was meant to produce. Falls back to `default` when no Slurm job id is
  present in the environment, when `scontrol` cannot be run, or when the
  field is missing from its output.
  """
  # Function-scope import so this snippet does not rely on the file's
  # top-level import list.
  import subprocess

  job_id = os.environ.get('SLURM_JOBID') or os.environ.get('SLURM_JOB_ID')
  if not job_id:
    return default

  try:
    result = subprocess.run(
        ['scontrol', 'show', 'job', job_id],
        stdout=subprocess.PIPE,
        stderr=subprocess.DEVNULL,
        universal_newlines=True,
        check=True)
  except (OSError, subprocess.CalledProcessError):
    # scontrol is not installed or the query failed; use the fallback.
    return default

  # scontrol prints whitespace-separated KEY=VALUE fields.
  for field in result.stdout.split():
    key, _, value = field.partition('=')
    if key == 'BatchHost' and value:
      return value
  return default


def main():
  """Parse command-line options, set the rendezvous env vars, spawn workers.

  Command-line options:
    -n / --nodes   number of nodes (default 1)
    -g / --gpus    number of GPUs per node (default 1)
    -nr / --nr     ranking of this node within the nodes (default 0)
    --epochs       number of total epochs to run (default 2)

  Sets `args.world_size` to `gpus * nodes`, exports MASTER_ADDR and
  MASTER_PORT, then starts `args.gpus` processes running `ppo_main`.
  """
  # Helper stuff to parse worldsize and rank and whatnot from commandline.
  parser = argparse.ArgumentParser()
  parser.add_argument('-n', '--nodes', default=1, type=int, metavar='N')
  parser.add_argument('-g', '--gpus', default=1, type=int, help='Number of gpus per node')
  parser.add_argument('-nr', '--nr', default=0, type=int, help='Ranking within the nodes')
  parser.add_argument('--epochs', default=2, type=int, metavar='N', help='Number of total epochs to run')
  args = parser.parse_args()

  args.world_size = args.gpus * args.nodes

  # Python does not perform shell command substitution inside a string, so
  # assigning "$(scontrol ...)" would store that text literally. Resolve the
  # batch host by actually running scontrol instead.
  os.environ['MASTER_ADDR'] = _resolve_master_addr()
  os.environ['MASTER_PORT'] = "12345"

  # mp.spawn calls ppo_main(i, args) for i in range(args.gpus).
  mp.spawn(ppo_main, nprocs=args.gpus, args=(args,))

the ppo_main function looks as follows:

def ppo_main(gpu, args):
  """Per-process training entry point started by `mp.spawn`.

  Args:
    gpu: index of this process on the current node, supplied by `mp.spawn`
      as the first positional argument; also used as the CUDA device index.
    args: parsed command-line namespace; reads `nr`, `gpus` and `world_size`.
  """
  # Global rank across all nodes: node index * GPUs per node + local index.
  rank = args.nr * args.gpus + gpu

  # This needs to be called in order to run data parallelization.
  # backend='nccl' is the necessary parameter when working with GPUs.
  # NOTE: init_method is passed explicitly, so the MASTER_ADDR / MASTER_PORT
  # environment variables are not consulted here. The hard-coded loopback
  # address can only be reached by processes on the same machine; for a
  # multi-node run it must be an address every process can route to.
  torch.distributed.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:54263', world_size=args.world_size, rank=rank)

  # Hyper parameters
  lr = 1e-3
  epochs = 200
  env_samples = 200
  gamma = 0.9
  batch_size = 256
  epsilon = 0.2
  policy_epochs = 30

  # Init environment
  state_size = 227
  action_size = 13
  env = gym.make("Slice-v0")

  torch.cuda.set_device(gpu)

  # Init networks
  # device_ids must name the local device the module lives on (`gpu`), not
  # the global `rank`: the two differ as soon as args.nr > 0, and `rank`
  # would then not match the device the module was moved to.
  policy_network = PolicyNetwork(state_size, action_size).cuda(gpu)
  policy_network = nn.parallel.DistributedDataParallel(policy_network, device_ids=[gpu])
  value_network = ValueNetwork(state_size).cuda(gpu)
  value_network = nn.parallel.DistributedDataParallel(value_network, device_ids=[gpu])
  ...

My PolicyNetwork and ValueNetwork are simple sequential layers.

Am I using DDP wrong here? I have looked at many online tutorials, but nothing seems to address this specific problem. I apologize if this is a rather elementary question. I appreciate your time!

Hey, this error is thrown because the mp.spawn call might not be within an if __name__ == "__main__" guard.
Can you try if this works?

# With the 'spawn' start method each child process re-imports this module;
# the guard keeps main() (and therefore mp.spawn) from running again during
# that import.
if __name__ == '__main__':
  main()

I added that statement and I think it is running? I am not sure because the progress bar I am using (through the tqdm package) is not showing up, but I have not gotten any errors and it has not broken. I set it to run for 200 epochs, so I suppose I will just have to wait and see if it worked.

Update
After 30 minutes, I got this error message.

[E socket.cpp:793] [c10d] The client socket has timed out after 1800s while trying to connect to (127.0.0.1, 54263).
[E socket.cpp:793] [c10d] The client socket has timed out after 1800s while trying to connect to (127.0.0.1, 54263).
[E socket.cpp:793] [c10d] The client socket has timed out after 1800s while trying to connect to (127.0.0.1, 54263).
[E socket.cpp:793] [c10d] The client socket has timed out after 1800s while trying to connect to (127.0.0.1, 54263).
Traceback (most recent call last):
  File "knot_gpu.py", line 350, in <module>
    main()
  File "knot_gpu.py", line 236, in main
    mp.spawn(ppo_main, nprocs=args.gpus, args=(args,))
  File "/fslhome/dskinne3/.local/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 240, in spawn
    return start_processes(fn, args, nprocs, join, daemon, start_method='spawn')
  File "/fslhome/dskinne3/.local/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 198, in start_processes
    while not context.join():
  File "/fslhome/dskinne3/.local/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 160, in join
    raise ProcessRaisedException(msg, error_index, failed_process.pid)
torch.multiprocessing.spawn.ProcessRaisedException: 

-- Process 2 terminated with the following error:
Traceback (most recent call last):
  File "/fslhome/dskinne3/.local/lib/python3.7/site-packages/torch/multiprocessing/spawn.py", line 69, in _wrap
    fn(i, *args)
  File "/zhome/dskinne3/knot_gpu.py", line 245, in ppo_main
    torch.distributed.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:54263', world_size=args.world_size, rank=rank)
  File "/fslhome/dskinne3/.local/lib/python3.7/site-packages/torch/distributed/distributed_c10d.py", line 595, in init_process_group
    store, rank, world_size = next(rendezvous_iterator)
  File "/fslhome/dskinne3/.local/lib/python3.7/site-packages/torch/distributed/rendezvous.py", line 186, in _tcp_rendezvous_handler
    store = _create_c10d_store(result.hostname, result.port, rank, world_size, timeout)
  File "/fslhome/dskinne3/.local/lib/python3.7/site-packages/torch/distributed/rendezvous.py", line 161, in _create_c10d_store
    hostname, port, world_size, start_daemon, timeout, multi_tenant=True
TimeoutError: The client socket has timed out after 1800s while trying to connect to (127.0.0.1, 54263).

Any thoughts?

  torch.distributed.init_process_group(backend='nccl', init_method='tcp://127.0.0.1:54263', world_size=args.world_size, rank=rank)

Hey @Dylan_Skinner, these two configurations conflict with each other: main() sets os.environ['MASTER_ADDR'] and os.environ['MASTER_PORT'], while init_process_group is given init_method='tcp://127.0.0.1:54263'. The first one is using the host IP and the second one is using localhost. The second one will be the one in use, as that’s what is explicitly passed to init_process_group. Based on the error below, it looks like localhost didn’t work. Can you try using the host IP instead?

TimeoutError: The client socket has timed out after 1800s while trying to connect to (127.0.0.1, 54263).

Oh! Sorry about that. I am using my University’s remote supercomputer and am no good at Linux (I got the first bit of code from one of the system administrators). Should I use the bit of code at os.environ['MASTER_ADDR'] or the localhost IP?

If you are using multiple machines, you will have to use a routable host IP that all processes can reach (i.e., not localhost).