I’d like to sample hyper-parameters in one process and share them with the other processes. I’m running in a Slurm environment, and I’ve attached a minimal example below.

With both `all_gather_object` and `broadcast_object_list` I get this error:

```
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1191, invalid usage, NCCL version 2.10.3
ncclInvalidUsage: This usually reflects invalid usage of NCCL library (such as too many async ops, too many collectives at once, mixing streams in a group, etc).
```
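The `broadcast_object_list` variant was along these lines (a sketch of the call pattern, assuming rank 0 is the source of the sampled hyper-parameters):

```python
# Sketch: rank 0 holds the real dict, the other ranks pass a placeholder.
obj_list = [hparams if config["rank"] == 0 else None]
torch.distributed.broadcast_object_list(obj_list, src=0)
hparams = obj_list[0]  # identical on every rank afterwards
```

Both variants fail in the same way. Here is the full example: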
broadcast.py

```python
import os
import torch
import subprocess


def setup(config):
    # Resolve the first node in the Slurm allocation to use as the rendezvous host.
    node_list = os.environ.get("SLURM_STEP_NODELIST")
    if node_list is None:
        node_list = os.environ.get("SLURM_JOB_NODELIST")
    hostnames = subprocess.check_output(["scontrol", "show", "hostnames", node_list])
    config["init_method"] = "tcp://{host}:{port}".format(
        host=hostnames.split()[0].decode("utf-8"),
        port=config["distributed_port"],
    )
    # Slurm exposes the global and node-local task indices as env variables.
    config["rank"] = int(os.environ.get("SLURM_PROCID"))
    config["local_rank"] = int(os.environ.get("SLURM_LOCALID"))
    torch.distributed.init_process_group(
        backend=config["distributed_backend"],
        init_method=config["init_method"],
        world_size=config["world_size"],
        rank=config["rank"],
    )
    return config


def f(**kwargs):
    config = kwargs.pop("config")
    hparams = kwargs  # only rank 0 receives a non-empty dict
    print("pre-broadcast hparams", config["rank"], hparams)
    # Gather every rank's hparams into obj_list; this is where the error is raised.
    obj_list = [None for _ in range(config["world_size"])]
    torch.distributed.all_gather_object(obj_list, hparams)
    print("post-broadcast hparams", config["rank"], obj_list)


if __name__ == "__main__":
    config = setup(
        {
            "world_size": 2,
            "distributed": True,
            "distributed_port": os.environ.get("MASTER_PORT", 13356),
            "distributed_backend": "nccl",
        }
    )
    print(config)
    hp = {}
    if config["rank"] == 0:
        hp = {"a": 1, "b": {"c": 2}}
    hp["config"] = config
    f(**hp)
```
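For comparison, my understanding is that the same object exchange over the `gloo` backend, which pickles the objects and moves them as CPU tensors, would look like the standalone sketch below (assumptions: single node, no Slurm, port 13356 free; all names are illustrative):

```python
# Standalone sketch of the object exchange I'm after, using the gloo
# backend (CPU) instead of nccl.
import torch.distributed as dist
import torch.multiprocessing as mp


def worker(rank, world_size):
    dist.init_process_group(
        backend="gloo",
        init_method="tcp://127.0.0.1:13356",
        world_size=world_size,
        rank=rank,
    )
    # Only rank 0 "samples" hyper-parameters.
    hparams = {"a": 1, "b": {"c": 2}} if rank == 0 else {}
    obj_list = [None for _ in range(world_size)]
    dist.all_gather_object(obj_list, hparams)
    print(rank, obj_list)  # every rank now sees rank 0's dict at index 0


if __name__ == "__main__":
    mp.spawn(worker, args=(2,), nprocs=2)
```

The question remains how to do this with the `nccl` backend used above.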
sbatch.sh

```bash
#!/bin/bash
#SBATCH --job-name=test-broadcast
#SBATCH --output=test-broadcast.out
#SBATCH --error=test-broadcast.err
#SBATCH --gres=gpu:2
#SBATCH --ntasks-per-node=2
#SBATCH --cpus-per-task=1
#SBATCH --time=00:01:00
#SBATCH --partition=main
#SBATCH --mem=10G

module load anaconda/3
module load cuda/11.2
conda activate ocp-a100

# Derive a port from the last 4 digits of the job id to avoid collisions.
export MASTER_PORT=$(expr 10000 + $(echo -n $SLURM_JOBID | tail -c 4))
echo "Master port $MASTER_PORT"

srun --output=test-broadcast-%t.out python broadcast.py
```
I get two outputs, one per task:
Task 0:

```
{'world_size': 2, 'distributed_port': '18489', 'distributed_backend': 'nccl', 'init_method': 'tcp://cn-g005:18489', 'rank': 0, 'local_rank': 0}
pre-broadcast hparams 0 {'a': 1, 'b': {'c': 2}}
Traceback (most recent call last):
  File "broadcast.py", line 49, in <module>
    f(**hp)
  File "broadcast.py", line 32, in f
    torch.distributed.all_gather_object(obj_list, hparams)
  File "/home/mila/s/schmidtv/.conda/envs/ocp-a100/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 1657, in all_gather_object
    all_gather(object_size_list, local_size, group=group)
  File "/home/mila/s/schmidtv/.conda/envs/ocp-a100/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 2068, in all_gather
    work = default_pg.allgather([tensor_list], [tensor])
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1191, invalid usage, NCCL version 2.10.3
ncclInvalidUsage: This usually reflects invalid usage of NCCL library (such as too many async ops, too many collectives at once, mixing streams in a group, etc).
```
Task 1:

```
{'world_size': 2, 'distributed_port': '18489', 'distributed_backend': 'nccl', 'init_method': 'tcp://cn-g005:18489', 'rank': 1, 'local_rank': 1}
pre-broadcast hparams 1 {}
Traceback (most recent call last):
  File "broadcast.py", line 49, in <module>
    f(**hp)
  File "broadcast.py", line 32, in f
    torch.distributed.all_gather_object(obj_list, hparams)
  File "/home/mila/s/schmidtv/.conda/envs/ocp-a100/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 1657, in all_gather_object
    all_gather(object_size_list, local_size, group=group)
  File "/home/mila/s/schmidtv/.conda/envs/ocp-a100/lib/python3.8/site-packages/torch/distributed/distributed_c10d.py", line 2068, in all_gather
    work = default_pg.allgather([tensor_list], [tensor])
RuntimeError: NCCL error in: ../torch/csrc/distributed/c10d/ProcessGroupNCCL.cpp:1191, invalid usage, NCCL version 2.10.3
ncclInvalidUsage: This usually reflects invalid usage of NCCL library (such as too many async ops, too many collectives at once, mixing streams in a group, etc).
```