I have two GPUs in a single node, a P100 and a V100. I want to assign one process to each GPU, but this code fails: it errors out when the processes try to attach to the devices for their ranks. Running the same code without pinning each process to a specific device works fine.
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from typing import Callable
def init_process(rank: int, size: int, fn: Callable[[int, int], None], backend="nccl"):
    """Initialize the distributed environment."""
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29505"  # Ensure the port is not in use
    os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)  # Assign each process to a specific GPU
    try:
        dist.init_process_group(backend, rank=rank, world_size=size)
        torch.cuda.set_device(rank)  # Set the current device
        fn(rank, size)
    finally:
        dist.destroy_process_group()  # Ensure proper cleanup

def do_all_reduce(rank: int, size: int):
    """Perform an all-reduce operation."""
    group = dist.new_group(list(range(size)))
    tensor = torch.ones(1).cuda(rank)  # Make sure the tensor is on the correct GPU
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=group)
    print(f"[Rank {rank}] All-Reduce result: {tensor[0].item()} on GPU {rank}")

if __name__ == "__main__":
    size = 2  # Number of processes / GPUs
    mp.set_start_method("spawn")
    processes = []
    for rank in range(size):
        if rank >= torch.cuda.device_count():
            raise ValueError(f"Invalid GPU rank: {rank}. Number of GPUs available: {torch.cuda.device_count()}")
        p = mp.Process(target=init_process, args=(rank, size, do_all_reduce, "nccl"))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
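I suspect the problem is setting CUDA_VISIBLE_DEVICES inside each spawned worker: once the child process sets it to str(rank), that process sees only one GPU, and CUDA renumbers it to device 0. torch.cuda.set_device(rank) and torch.ones(1).cuda(rank) then ask for device 1 in a process that only exposes device 0, so rank 1 should fail with an invalid device ordinal. A minimal sketch of the variant I would expect to work, which drops the per-process CUDA_VISIBLE_DEVICES and pins each worker with set_device(rank) alone (same port 29505 and world size 2; the default process group is used instead of new_group):

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from typing import Callable

def init_process(rank: int, size: int, fn: Callable[[int, int], None], backend="nccl"):
    """Initialize the distributed environment and pin this worker to one GPU."""
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29505"
    torch.cuda.set_device(rank)  # both GPUs stay visible; each worker picks its own
    try:
        dist.init_process_group(backend, rank=rank, world_size=size)
        fn(rank, size)
    finally:
        dist.destroy_process_group()

def do_all_reduce(rank: int, size: int):
    """All-reduce a one-element tensor across both ranks."""
    tensor = torch.ones(1, device=f"cuda:{rank}")  # allocate on this rank's GPU
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)  # default group covers all ranks
    print(f"[Rank {rank}] All-Reduce result: {tensor[0].item()} on GPU {rank}")

if __name__ == "__main__":
    size = 2
    mp.set_start_method("spawn")
    processes = []
    for rank in range(size):
        p = mp.Process(target=init_process, args=(rank, size, do_all_reduce, "nccl"))
        p.start()
        processes.append(p)
    for p in processes:
        p.join()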
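Alternatively, if the point of the per-process CUDA_VISIBLE_DEVICES is to make each worker physically unable to touch the other GPU, then every CUDA index inside the worker has to be 0, not rank, because the single visible device is always renumbered to 0. A sketch of just the two functions under that assumption (the main block stays the same):

def init_process(rank: int, size: int, fn: Callable[[int, int], None], backend="nccl"):
    """Each worker sees exactly one GPU, exposed as local device 0."""
    os.environ["MASTER_ADDR"] = "127.0.0.1"
    os.environ["MASTER_PORT"] = "29505"
    os.environ["CUDA_VISIBLE_DEVICES"] = str(rank)  # hide the other GPU from this worker
    torch.cuda.set_device(0)  # the only visible device is always index 0
    try:
        dist.init_process_group(backend, rank=rank, world_size=size)
        fn(rank, size)
    finally:
        dist.destroy_process_group()

def do_all_reduce(rank: int, size: int):
    tensor = torch.ones(1, device="cuda:0")  # local 0 maps to a different physical GPU per rank
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM)
    print(f"[Rank {rank}] All-Reduce result: {tensor[0].item()}")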