I’m running the “Hello World” example below on a SLURM cluster (1 node, 3 GPUs) with this command:
$ CUDA_VISIBLE_DEVICES=0,1,2 python -m torch.distributed.launch --nproc_per_node=3 main.py
However, it fails with the error shown below.
Output
[/home/<>/...] $ CUDA_VISIBLE_DEVICES=0,1,2 python -m torch.distributed.launch --nproc_per_node=3 main.py
/home/<>/.local/lib/python3.9/site-packages/torch/distributed/launch.py:178: FutureWarning: The module torch.distributed.launch is deprecated
and will be removed in future. Use torchrun.
Note that --use_env is set by default in torchrun.
If your script expects `--local_rank` argument to be set, please
change it to read from `os.environ['LOCAL_RANK']` instead. See
https://pytorch.org/docs/stable/distributed.html#launch-utility for
further instructions
warnings.warn(
WARNING:torch.distributed.run:
*****************************************
Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed.
*****************************************
Current rank: 1
Current rank: 0
Current rank: 2
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: -9) local_rank: 2 (pid: 15367) of binary: /apps/pytorch/1.8.1/bin/python
Traceback (most recent call last):
File "/apps/pytorch/1.8.1/lib/python3.9/runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/apps/pytorch/1.8.1/lib/python3.9/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/<username>/.local/lib/python3.9/site-packages/torch/distributed/launch.py", line 193, in <module>
main()
File "/home/<username>/.local/lib/python3.9/site-packages/torch/distributed/launch.py", line 189, in main
launch(args)
File "/home/<username>/.local/lib/python3.9/site-packages/torch/distributed/launch.py", line 174, in launch
run(args)
File "/home/<username>/.local/lib/python3.9/site-packages/torch/distributed/run.py", line 710, in run
elastic_launch(
File "/home/<username>/.local/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 131, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/<username>/.local/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 259, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
======================================================
main.py FAILED
------------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2022-05-16_10:06:35
host : c44a-s17.ufhpc
rank : 2 (local_rank: 2)
exitcode : -9 (pid: 15367)
error_file: <N/A>
traceback : Signal 9 (SIGKILL) received by PID 15367
======================================================
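Side note: per the FutureWarning at the top of the log, torch.distributed.launch is deprecated, so the torchrun equivalent of my launch command would presumably be:
$ CUDA_VISIBLE_DEVICES=0,1,2 torchrun --nproc_per_node=3 main.py
The failure above occurs with torch.distributed.launch exactly as shown; I have not migrated yet.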
Script
import argparse
import os
import random

import numpy as np
import torch
import torchvision


def set_random_seeds(random_seed=0):
    torch.manual_seed(random_seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    np.random.seed(random_seed)
    random.seed(random_seed)


def main():
    num_epochs_default = 10000
    batch_size_default = 256  # 1024
    learning_rate_default = 0.1
    random_seed_default = 0
    model_dir_default = "saved_models"
    model_filename_default = "resnet_distributed.pth"

    # Each process runs on 1 GPU device specified by the local_rank argument.
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--local_rank", type=int, help="Local rank. Necessary for using the torch.distributed.launch utility.")
    parser.add_argument("--num_epochs", type=int, help="Number of training epochs.", default=num_epochs_default)
    parser.add_argument("--batch_size", type=int, help="Training batch size for one process.", default=batch_size_default)
    parser.add_argument("--learning_rate", type=float, help="Learning rate.", default=learning_rate_default)
    parser.add_argument("--random_seed", type=int, help="Random seed.", default=random_seed_default)
    parser.add_argument("--model_dir", type=str, help="Directory for saving models.", default=model_dir_default)
    parser.add_argument("--model_filename", type=str, help="Model filename.", default=model_filename_default)
    parser.add_argument("--resume", action="store_true", help="Resume training from saved checkpoint.")
    argv = parser.parse_args()

    local_rank = argv.local_rank
    num_epochs = argv.num_epochs
    batch_size = argv.batch_size
    learning_rate = argv.learning_rate
    random_seed = argv.random_seed
    model_dir = argv.model_dir
    model_filename = argv.model_filename
    resume = argv.resume

    print(f"Current rank: {local_rank}")

    model_filepath = os.path.join(model_dir, model_filename)

    set_random_seeds(random_seed=random_seed)

    # Join the process group; NCCL is the backend for multi-GPU training.
    torch.distributed.init_process_group(backend="nccl", world_size=torch.cuda.device_count())

    model = torchvision.models.resnet18(pretrained=False)
    device = torch.device("cuda:{}".format(local_rank))


if __name__ == "__main__":
    main()
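For completeness, this is my understanding of the env-based rank handling that the deprecation warning recommends for torchrun; a sketch of the change, not what I am currently running:

import os

# torchrun exports LOCAL_RANK (plus RANK and WORLD_SIZE) to each worker
# process, so the --local_rank command-line argument is no longer needed.
local_rank = int(os.environ["LOCAL_RANK"])
print(f"Current rank: {local_rank}")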