Failure to assign each dist process to an individual GPU

I have 2 GPUs on 1 node, a P100 and a V100.
I want to assign 1 process to 1 GPU, but the code below fails: it raises an error when the processes try to attach to their assigned ranks/devices.
Running it without assigning processes to specific devices works fine.


import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from typing import Callable

def init_process(rank: int, size: int, fn: Callable[[int, int], None], backend="nccl"):
    """Initialize the distributed environment."""
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29505"  # Ensure the port is not in use
    os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)  # Assign each process to a specific GPU
    try:
        dist.init_process_group(backend, rank=rank, world_size=size)
        torch.cuda.set_device(rank)  # Set the current device
        fn(rank, size)
    finally:
        dist.destroy_process_group()  # Ensure proper cleanup

def do_all_reduce(rank: int, size: int):
    """Perform an all-reduce operation."""
    group = dist.new_group(list(range(size)))
    tensor = torch.ones(1).cuda(rank)  # Make sure the tensor is on the correct GPU
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=group)
    print(f"[Rank {rank}] All-Reduce result: {tensor[0].item()} on GPU {rank}")

if __name__ == "__main__":
    size = 2  # Number of processes / GPUs
    mp.set_start_method("spawn")
    processes = []

    for rank in range(size):
        if rank >= torch.cuda.device_count():
            raise ValueError(f"Invalid GPU rank: {rank}. Number of GPUs available: {torch.cuda.device_count()}")
        p = mp.Process(target=init_process, args=(rank, size, do_all_reduce, "nccl"))
        p.start()
        processes.append(p)

    for p in processes:
        p.join()

Maybe it’s because you do this:

os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)

Usually we set CUDA_VISIBLE_DEVICES to the same list of GPUs for every rank, e.g. "0,1", and let each process pick its own device with torch.cuda.set_device(rank). Once you restrict CUDA_VISIBLE_DEVICES to a single GPU, that GPU becomes device index 0 inside that process, so I think torch.cuda.set_device(rank) fails for rank 1.

Also, is rank here the local rank or the global rank?
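
If it helps, here is a minimal sketch of what I mean, assuming rank is the local rank on your single node and keeping your MASTER_ADDR/MASTER_PORT values: leave CUDA_VISIBLE_DEVICES untouched so both GPUs stay visible, and pin each process to its GPU with torch.cuda.set_device(rank) only. You can drop this in place of your init_process and keep the rest of your script unchanged.

import os
import torch
import torch.distributed as dist
from typing import Callable

def init_process(rank: int, size: int, fn: Callable[[int, int], None], backend="nccl"):
    """Initialize the distributed environment; do not touch CUDA_VISIBLE_DEVICES."""
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29505"
    try:
        dist.init_process_group(backend, rank=rank, world_size=size)
        # Both GPUs stay visible, so the device index equals the rank on this node
        torch.cuda.set_device(rank)
        fn(rank, size)
    finally:
        dist.destroy_process_group()

If you really want one visible GPU per process instead, then inside each process the only visible device has index 0, so you would need torch.cuda.set_device(0) and torch.ones(1).cuda(0) rather than indexing by rank.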