NCCL all-gather hangs in PyTorch under SCHED_RR real-time scheduling

import torch
import torch.distributed as dist
import os

def setup(rank, world_size):
    """Join the default NCCL process group for this rank.

    Every rank rendezvouses at the same master address/port; this call
    blocks until all `world_size` processes have joined.
    """
    rendezvous = {'MASTER_ADDR': 'localhost', 'MASTER_PORT': '12355'}
    os.environ.update(rendezvous)
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    """Tear down the default process group so the worker can exit cleanly."""
    dist.destroy_process_group()

def nccl_allgather(rank, world_size):
    """Worker entry point: all-gather each rank's tensor across the group.

    Args:
        rank: this process's rank; also used as its CUDA device index.
        world_size: total number of participating processes.
    """
    setup(rank, world_size)

    # Pin this process to its own GPU before allocating tensors so NCCL
    # maps exactly one device per rank.
    torch.cuda.set_device(rank)

    tensor = torch.full((10,), rank, dtype=torch.float32).cuda(rank)
    print(f"Rank {rank} has data {tensor}")

    gathered_tensor = [torch.zeros_like(tensor) for _ in range(world_size)]

    # BUG FIX: the original passed async_op=True but discarded the returned
    # work handle, so the buffers below could be read before the collective
    # completed. Keep the handle and wait on it before using the results.
    work = dist.all_gather(gathered_tensor, tensor, async_op=True)
    work.wait()

    gathered_tensor = torch.cat(gathered_tensor, dim=0)
    print(f"Gathered on rank {rank}: {gathered_tensor}")

    cleanup()

def main():
    """Launch one all-gather worker process per visible GPU."""
    n_gpus = torch.cuda.device_count()
    torch.multiprocessing.spawn(
        nccl_allgather, args=(n_gpus,), nprocs=n_gpus, join=True
    )

if __name__ == "__main__":
    print(os.getpid())
    os.environ["NCCL_DEBUG"] = "INFO"
    os.environ["NCCL_DEBUG_SUBSYS"] = "GRAPH"

    # WARNING: SCHED_RR is a real-time policy — RR threads preempt every
    # normal (CFS) thread indefinitely. When there are fewer CPU cores than
    # ranks, the busy RR worker processes can starve NCCL's background
    # progress/watchdog threads of CPU entirely, so collectives never
    # complete and the job hangs. Under CFS every runnable thread still
    # gets a timeslice, which matches the observation that the hang
    # disappears there. Prefer CFS, or guarantee cores >= ranks.
    os.sched_setscheduler(0, os.SCHED_RR, os.sched_param(10))

    # BUG FIX: main() was previously called at module scope, outside this
    # guard. torch.multiprocessing.spawn re-imports this module in every
    # child, so an unguarded main() re-spawns workers recursively. It must
    # run only under the __main__ guard. (Also removed the unused `i = 0`,
    # the redundant `import os`, and the unused `import time`.)
    main()

I have a node with four GPUs. When I execute the above code with 4 CPU cores allocated, it runs perfectly. However, when I execute it with only 3 CPU cores, it hangs.
The hang does not occur when the scheduling class is left as CFS instead of SCHED_RR.