I am attempting to use symmetric memory for all reduces, specifically the one shot kernel but I am hitting errors. There doesn’t seem to be much documentation on these features and I was wondering if there are any examples I can take a look at to use. Below is a small reproducer of the errors that I am seeing:
import os
import torch
import torch.distributed as torch_dist
import torch.distributed._symmetric_memory as symm_mem
def main():
# Initialize the distributed environment
local_rank = int(os.environ.get("LOCAL_RANK", 0))
rank = int(os.environ.get("RANK", 0))
world_size = int(os.environ.get("WORLD_SIZE", 1))
# Set the device
device = torch.device(f"cuda:{local_rank}")
torch.cuda.set_device(device)
# Initialize the process group
torch_dist.init_process_group("nccl")
# Create an explicit process group with all processes
ranks = list(range(world_size))
explicit_group = torch_dist.new_group(ranks=ranks)
# Get the group name from the explicit group
group_name = explicit_group.group_name
if rank == 0:
print(f"Created group with name: {group_name}")
# Create a simple tensor with values equal to the rank
tensor_size = 1024
tensor_regular = torch.ones(tensor_size, device=device) * rank
tensor_copy = tensor_regular.clone()
# Regular all-reduce using the explicit group
torch_dist.all_reduce(tensor_regular, op=torch_dist.ReduceOp.SUM, group=explicit_group)
try:
if rank == 0:
print(f"Using group name for symmetric memory: {group_name}")
# Create symmetric memory tensor
msg = symm_mem.empty(
tensor_size,
dtype=torch.float32,
device=device,
)
# Perform rendezvous with the explicit group's name
symm_mem.rendezvous(msg, group_name)
# Copy input data
msg.copy_(tensor_copy)
# Perform one-shot all-reduce with the explicit group's name
torch.ops.symm_mem.one_shot_all_reduce(
msg,
"sum",
group_name
)
except Exception as e:
print(f"[Rank {rank}] Error: {e}")
import traceback
traceback.print_exc()
# Clean up
torch_dist.destroy_process_group()
if __name__ == "__main__":
main()
This is giving me the error even though I am passing group names.
RuntimeError: CUDASymmetricMemory::rendezvous: `group_name` is neither specified during allocation nor passed to rendezvous().