Using Symmetric Memory One-Shot All-Reduce

I am attempting to use symmetric memory for all-reduces, specifically the one-shot kernel, but I am hitting errors. There doesn’t seem to be much documentation on these features, and I was wondering if there are any examples I could look at. Below is a small reproducer of the error I am seeing:

import os
import torch
import torch.distributed as torch_dist
import torch.distributed._symmetric_memory as symm_mem

def main():
    # Read the rank info provided by the launcher (e.g. torchrun)
    local_rank = int(os.environ.get("LOCAL_RANK", 0))
    rank = int(os.environ.get("RANK", 0))
    world_size = int(os.environ.get("WORLD_SIZE", 1))
    
    # Set the device
    device = torch.device(f"cuda:{local_rank}")
    torch.cuda.set_device(device)
    
    # Initialize the process group
    torch_dist.init_process_group("nccl")
    
    # Create an explicit process group with all processes
    ranks = list(range(world_size))
    explicit_group = torch_dist.new_group(ranks=ranks)
    
    # Get the group name from the explicit group
    group_name = explicit_group.group_name
    
    if rank == 0:
        print(f"Created group with name: {group_name}")
    
    # Create a simple tensor with values equal to the rank
    tensor_size = 1024
    tensor_regular = torch.ones(tensor_size, device=device) * rank
    tensor_copy = tensor_regular.clone()
    
    # Regular all-reduce using the explicit group
    torch_dist.all_reduce(tensor_regular, op=torch_dist.ReduceOp.SUM, group=explicit_group)
    
    try:
        if rank == 0:
            print(f"Using group name for symmetric memory: {group_name}")
        
        # Create symmetric memory tensor 
        msg = symm_mem.empty(
            tensor_size,
            dtype=torch.float32,
            device=device,
        )
        
        # Perform rendezvous with the explicit group's name
        symm_mem.rendezvous(msg, group_name)
        
        # Copy input data
        msg.copy_(tensor_copy)
        
        # Perform one-shot all-reduce with the explicit group's name.
        # The op is out-of-place and returns the reduced tensor.
        result = torch.ops.symm_mem.one_shot_all_reduce(
            msg,
            "sum",
            group_name,
        )
        
        # Verify against the regular NCCL all-reduce result
        torch.testing.assert_close(result, tensor_regular)
        if rank == 0:
            print("one_shot_all_reduce matches the regular all_reduce")
        
    except Exception as e:
        print(f"[Rank {rank}] Error: {e}")
        import traceback
        traceback.print_exc()
    
    # Clean up
    torch_dist.destroy_process_group()

if __name__ == "__main__":
    main()
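
For reference, I am running this with torchrun on a single node, which sets RANK, WORLD_SIZE, and LOCAL_RANK for each process, e.g.:

torchrun --nproc-per-node=2 repro.py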

This fails with the following error, even though I am explicitly passing the group name to rendezvous():

RuntimeError: CUDASymmetricMemory::rendezvous: `group_name` is neither specified during allocation nor passed to rendezvous().
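
In case it helps diagnose the issue, below is the alternative pattern I pieced together from the helpers in torch.distributed._symmetric_memory. This is only a sketch of what I understood from reading the module, not something I have confirmed works: in particular, I am assuming the default world group's name can be used directly, and I am not sure whether enable_symm_mem_for_group is required before allocation.

import torch
import torch.distributed as torch_dist
import torch.distributed._symmetric_memory as symm_mem

# Assumes the process group has already been initialized with
# torch_dist.init_process_group("nccl") and the CUDA device has been
# set, as in the reproducer above.

# Assumption: use the default world group's registered name instead
# of a group created via new_group()
group_name = torch_dist.group.WORLD.group_name

# Assumption: the group may need to be registered for symmetric
# memory before any buffers are allocated against it
symm_mem.enable_symm_mem_for_group(group_name)

msg = symm_mem.empty(1024, dtype=torch.float32, device="cuda")
symm_mem.rendezvous(msg, group_name)
result = torch.ops.symm_mem.one_shot_all_reduce(msg, "sum", group_name)

If the intended usage is different (e.g. rendezvous expects the ProcessGroup object itself rather than its name), a pointer to a working example would be much appreciated.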