import torch
import torch.distributed as dist
import os
def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()
def nccl_allgather(rank, world_size):
    setup(rank, world_size)
    # Each rank contributes a 10-element tensor filled with its own rank id,
    # placed on its own GPU.
    tensor = torch.full((10,), rank, dtype=torch.float32).cuda(rank)
    print(f"Rank {rank} has data {tensor}")
    gathered_tensor = [torch.zeros_like(tensor) for _ in range(world_size)]
    # async_op=True returns a Work handle; wait on it before reading the output buffers.
    work = dist.all_gather(gathered_tensor, tensor, async_op=True)
    work.wait()
    gathered_tensor = torch.cat(gathered_tensor, dim=0)
    print(f"Gathered on rank {rank}: {gathered_tensor}")
    cleanup()
def main():
    world_size = torch.cuda.device_count()
    # Launch one worker process per GPU.
    torch.multiprocessing.spawn(nccl_allgather, args=(world_size,), nprocs=world_size, join=True)
if __name__ == "__main__":
    print(os.getpid())
    # Enable verbose NCCL logging for the graph/topology subsystem.
    os.environ["NCCL_DEBUG"] = "INFO"
    os.environ["NCCL_DEBUG_SUBSYS"] = "GRAPH"
    # Put the process into the real-time round-robin scheduling class
    # (SCHED_RR) at priority 10; this requires appropriate privileges.
    os.sched_setscheduler(0, os.SCHED_RR, os.sched_param(10))
    main()
I have a node with four GPUs. When I execute the above code with 4 CPU cores allocated, it runs perfectly; however, when I execute it with only 3 CPU cores, it hangs. The hang does not occur when the scheduling class is left as the default CFS instead of SCHED_RR.
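For reproduction, the core restriction can be applied from the shell (e.g. by prepending taskset -c 0-2 to the launch command) or programmatically before calling main(), as in the sketch below. The use of os.sched_setaffinity and the specific core IDs 0-2 are illustrative assumptions, not necessarily how the cores were allocated in the runs above; any mechanism that limits the process to 3 cores should set up the same scenario.

import os

# Hypothetical reproduction helper: pin this process to 3 cores before spawning
# the workers. Processes created by torch.multiprocessing.spawn inherit this
# affinity mask. The core IDs {0, 1, 2} are an illustrative assumption.
os.sched_setaffinity(0, {0, 1, 2})
print(os.sched_getaffinity(0))  # confirm the effective CPU set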