Memory leak while using a shared tensor

For the following code:

import torch
import torch.distributed as dist
import os
from multiprocessing import shared_memory

def main():
    dist.init_process_group(backend="nccl")

    local_rank = int(os.environ["LOCAL_RANK"])
    global_rank = int(os.environ["RANK"])
    world_size = int(os.environ["WORLD_SIZE"])

    torch.cuda.set_device(local_rank)
    
    tensor_shape = (world_size, 5)
    dtype = torch.float32
    element_size = torch.tensor([], dtype=dtype).element_size()
    num_bytes = element_size * tensor_shape[0] * tensor_shape[1]

    if global_rank == 0:
        # Rank 0 creates the shared memory
        shm = shared_memory.SharedMemory(name="shared_tensor", create=True, size=num_bytes)
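        # View the raw shared buffer as a (world_size, 5) CPU tensor; frombuffer shares the memory, it does not copy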
        shared_tensor = torch.frombuffer(shm.buf, dtype=dtype).reshape(tensor_shape)
        shared_tensor.zero_()  # Initialize
    else:
        # Other ranks connect to the shared memory
        shm = shared_memory.SharedMemory(name="shared_tensor", create=False)
        shared_tensor = torch.frombuffer(shm.buf, dtype=dtype).reshape(tensor_shape)

    # Modify the shared tensor: rank 0 adds 5 to its own row
    if global_rank == 0:
        shared_tensor[dist.get_rank()] += 5

    dist.barrier()
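    # After the barrier, every rank subtracts 2 from its own row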
    shared_tensor[dist.get_rank()] -= 2

    dist.barrier()

    # Print results
    print(f"   world_rank:{dist.get_rank()}, shared_tensor dev:{shared_tensor.device}, ptr:{shared_tensor.data_ptr()} ")
    if global_rank == 0:
        print(f"Final shared tensor (rank 0): {shared_tensor}")
    
    
    # Cleanup shared memory: every rank closes its mapping, only rank 0 unlinks
    dist.barrier()
    shm.close()

    dist.barrier()
    if global_rank == 0:
        shm.unlink()

    dist.destroy_process_group()



if __name__ == "__main__":
    main()

we get the following output; note the resource_tracker warnings at shutdown:

**torchrun --nproc_per_node=2 --nnodes=1 --node_rank=0 --master_addr="172.16.28.10" --master_port=12346 mp_shared_mem.py** 


W1213 16:56:58.803000 139837017543104 torch/distributed/run.py:779] 
W1213 16:56:58.803000 139837017543104 torch/distributed/run.py:779] *****************************************
W1213 16:56:58.803000 139837017543104 torch/distributed/run.py:779] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W1213 16:56:58.803000 139837017543104 torch/distributed/run.py:779] *****************************************
   world_rank:0, shared_tensor dev:cpu, ptr:140225387536384 
   world_rank:1, shared_tensor dev:cpu, ptr:140405073473536 
Final shared tensor (rank 0): tensor([[ 3.,  3.,  3.,  3.,  3.],
        [-2., -2., -2., -2., -2.]])
/usr/lib/python3.10/multiprocessing/resource_tracker.py:224: UserWarning: resource_tracker: There appear to be 1 leaked shared_memory objects to clean up at shutdown
  warnings.warn('resource_tracker: There appear to be %d '
/usr/lib/python3.10/multiprocessing/resource_tracker.py:237: UserWarning: resource_tracker: '/shared_tensor': [Errno 2] No such file or directory: '/shared_tensor'
  warnings.warn('resource_tracker: %r: %s' % (name, e))

We are trying to create shared tensors on the CPU so that each process can write to them independently (copying from GPU tensors). The idea is to compute on the CPU and then have each process read the result back onto its GPU, without going through the gloo backend (dist.all_gather/…).
But before building that, we wanted to test whether we can use shared_memory at all, and it seems the memory is always leaking.
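
For context, the pattern we are eventually aiming for looks roughly like the sketch below. This is only an illustration, not our real workload: the segment name "exchange_buf", the helper gpu_roundtrip_via_shared_cpu, and the dummy per-rank data are all invented for the example.

import torch
import torch.distributed as dist
from multiprocessing import shared_memory

def gpu_roundtrip_via_shared_cpu(rank, world_size, row_len=5):
    # Hypothetical sketch: each rank copies a GPU tensor into its row of a
    # CPU tensor backed by shared memory, everyone synchronizes, and then
    # each rank reads the full CPU tensor back onto its own GPU.
    dtype = torch.float32
    num_bytes = torch.tensor([], dtype=dtype).element_size() * world_size * row_len

    if rank == 0:
        shm = shared_memory.SharedMemory(name="exchange_buf", create=True, size=num_bytes)
    dist.barrier()  # the segment must exist before the other ranks attach
    if rank != 0:
        shm = shared_memory.SharedMemory(name="exchange_buf", create=False)

    flat = torch.frombuffer(shm.buf, dtype=dtype)
    cpu_view = flat[: world_size * row_len].reshape(world_size, row_len)

    gpu_data = torch.full((row_len,), float(rank), device="cuda")  # dummy per-rank GPU data
    cpu_view[rank].copy_(gpu_data)      # GPU -> shared CPU row (device-to-host copy)
    dist.barrier()                      # wait until every rank has written its row
    gathered = cpu_view.clone().cuda()  # shared CPU -> private GPU copy on each rank

    dist.barrier()                      # keep the segment alive until everyone has copied
    del cpu_view, flat                  # drop the tensor views of the buffer before closing
    shm.close()
    if rank == 0:
        shm.unlink()
    return gathered

Each rank would call this after init_process_group, so the leaked shared_memory warning above is what is blocking us from building this.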