DDP "Hello World" failing. Help!

I’m running the “Hello World” example below using this command on a SLURM cluster (1 node, 3 GPUs).

$ CUDA_VISIBLE_DEVICES=0,1,2 python -m torch.distributed.launch --nproc_per_node=3 main.py

Output

However, it fails with this error:

[/home/<>/...] $ CUDA_VISIBLE_DEVICES=0,1,2 python -m torch.distributed.launch --nproc_per_node=3 main.py
/home/<>/.local/lib/python3.9/site-packages/torch/distributed/launch.py:178: FutureWarning: The module torch.distributed.launch is deprecated
and will be removed in future. Use torchrun.
Note that --use_env is set by default in torchrun.
If your script expects `--local_rank` argument to be set, please
change it to read from `os.environ['LOCAL_RANK']` instead. See 
https://pytorch.org/docs/stable/distributed.html#launch-utility for 
further instructions

  warnings.warn(
WARNING:torch.distributed.run:
*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
*****************************************
Current rank: 1
Current rank: 0
Current rank: 2
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -9) local_rank: 2 (pid: 15367) of binary: /apps/pytorch/1.8.1/bin/python
Traceback (most recent call last):
  File "/apps/pytorch/1.8.1/lib/python3.9/runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/apps/pytorch/1.8.1/lib/python3.9/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/home/<username>/.local/lib/python3.9/site-packages/torch/distributed/launch.py", line 193, in <module>
    main()
  File "/home/<username>/.local/lib/python3.9/site-packages/torch/distributed/launch.py", line 189, in main
    launch(args)
  File "/home/<username>/.local/lib/python3.9/site-packages/torch/distributed/launch.py", line 174, in launch
    run(args)
  File "/home/<username>/.local/lib/python3.9/site-packages/torch/distributed/run.py", line 710, in run
    elastic_launch(
  File "/home/<username>/.local/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
    return launch_agent(self._config, self._entrypoint, list(args))
  File "/home/<username>/.local/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 259, in launch_agent
    raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError: 
======================================================
main.py FAILED
------------------------------------------------------
Failures:
  <NO_OTHER_FAILURES>
------------------------------------------------------
Root Cause (first observed failure):
[0]:
  time      : 2022-05-16_10:06:35
  host      : c44a-s17.ufhpc
  rank      : 2 (local_rank: 2)
  exitcode  : -9 (pid: 15367)
  error_file: <N/A>
  traceback : Signal 9 (SIGKILL) received by PID 15367
======================================================
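For reference, the deprecation warning above says to use torchrun instead of torch.distributed.launch; as far as I can tell, the equivalent launch command would be the following (with the script reading LOCAL_RANK from the environment, as sketched after the script below):

$ CUDA_VISIBLE_DEVICES=0,1,2 torchrun --nproc_per_node=3 main.py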

Script

import argparse
import os
import random

import numpy as np
import torch
import torch.distributed
import torchvision


def set_random_seeds(random_seed=0):
    torch.manual_seed(random_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)


def main():
    num_epochs_default = 10000
    batch_size_default = 256 # 1024
    learning_rate_default = 0.1
    random_seed_default = 0
    model_dir_default = "saved_models"
    model_filename_default = "resnet_distributed.pth"

    # Each process runs on 1 GPU device specified by the local_rank argument.
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--local_rank", type=int, help="Local rank. Necessary for using the torch.distributed.launch utility.")
    parser.add_argument("--num_epochs", type=int, help="Number of training epochs.", default=num_epochs_default)
    parser.add_argument("--batch_size", type=int, help="Training batch size for one process.", default=batch_size_default)
    parser.add_argument("--learning_rate", type=float, help="Learning rate.", default=learning_rate_default)
    parser.add_argument("--random_seed", type=int, help="Random seed.", default=random_seed_default)
    parser.add_argument("--model_dir", type=str, help="Directory for saving models.", default=model_dir_default)
    parser.add_argument("--model_filename", type=str, help="Model filename.", default=model_filename_default)
    parser.add_argument("--resume", action="store_true", help="Resume training from saved checkpoint.")
    argv = parser.parse_args()

    local_rank = argv.local_rank
    num_epochs = argv.num_epochs
    batch_size = argv.batch_size
    learning_rate = argv.learning_rate
    random_seed = argv.random_seed
    model_dir = argv.model_dir
    model_filename = argv.model_filename
    resume = argv.resume

    print(f"Current rank: {local_rank}")

    model_filepath = os.path.join(model_dir, model_filename)
    set_random_seeds(random_seed=random_seed)
    torch.distributed.init_process_group(backend="nccl", world_size=torch.cuda.device_count())
    model = torchvision.models.resnet18(pretrained=False)
    device = torch.device("cuda:{}".format(local_rank))


if __name__ == "__main__":
    main()
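
For completeness, my understanding of the warning is that torchrun does not pass --local_rank on the command line (--use_env behavior is the default there), so the script would read the local rank from the environment instead. A minimal sketch of that change, with everything else unchanged (the helper names are just for illustration):

import os

import torch


def get_local_rank():
    # torchrun (and torch.distributed.launch with --use_env) exports LOCAL_RANK
    # for every worker process, so the argparse --local_rank option is not needed.
    return int(os.environ["LOCAL_RANK"])


def setup_distributed():
    # The launcher also exports MASTER_ADDR, MASTER_PORT, RANK and WORLD_SIZE,
    # which init_process_group picks up via the default env:// init method.
    torch.distributed.init_process_group(backend="nccl")
    local_rank = get_local_rank()
    torch.cuda.set_device(local_rank)
    return torch.device(f"cuda:{local_rank}")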