Apparent RAM leak when converting a batch of ndarray states to GPU tensors

Hello everyone,
I have been running into an inconsistent issue when converting a list/batch of states (dictionaries mapping names to ndarrays) to tensors on the GPU. With most of the approaches I have tried, RAM usage sometimes grows as expected, at ~31 MiB per episode (500 steps), and other times appears to grow at ~31 MiB per step. The code snippet below reproduces the issue.
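As a sanity check on those two numbers, here is my own back-of-envelope arithmetic based on the shapes used in the snippet:

# each image is 9 * 84 * 84 uint8 = 63,504 bytes (~62 KiB)
# 500 new states per episode -> 500 * 63_504 bytes ≈ 30 MiB per episode
# one batch of 128 images as float32 -> 128 * 63_504 * 4 bytes ≈ 31 MiB

So the "expected" growth matches raw uint8 states accumulating in the replay buffer, while the per-step figure happens to match the size of a single float32-converted batch.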

import numpy as np
import os
import psutil
import torch
import time
import gc
import torch.multiprocessing as mp

process = psutil.Process(os.getpid())

def get_memory():
    # resident set size (RSS) of this process, in MiB
    return process.memory_info().rss / (1024 ** 2)

def create_state() -> dict[str, np.ndarray]:
    state = {
        "image": np.random.randint(0, 256, (9, 84, 84), dtype=np.uint8),
        "vector": np.random.rand(128).astype(np.float32)
    }
    return state

def create_experience(curr_state: dict[str, np.ndarray], new_state: dict[str, np.ndarray]) -> dict[str, dict[str, np.ndarray] | int | float | bool]:
    experience = {
        "state": curr_state,
        "action": np.random.randint(0, 4),
        "reward": np.random.rand(),
        "next_state": new_state,
        "done": np.random.choice([True, False])
    }
    return experience
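# Note: new_state at step i becomes curr_state at step i + 1, so consecutive
# experiences share their state dicts and the buffer grows by roughly one new
# state (~62 KiB) per step, not two.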

# Original approach, reduced to np.stack + torch.as_tensor
# (tried with and without torch.no_grad()) - FAILS
def stack(states, device, dtype=torch.float32):
    states_images = np.stack([s["image"] for s in states])
    states_vector = np.stack([s["vector"] for s in states])
    with torch.no_grad():
        states_images_tensor = torch.as_tensor(states_images, dtype=dtype, device=device) / 255.0
        states_vector_tensor = torch.as_tensor(states_vector, dtype=dtype, device=device)
    return states_images_tensor, states_vector_tensor
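# (Because the source array is uint8, torch.as_tensor cannot alias it and must
# materialise a float32 copy somewhere; if that copy lands in host memory it is
# ~31 MiB per 128-image batch. My unverified suspicion is that whether pages of
# that size are returned to the OS after each step is allocator-dependent,
# which would explain the inconsistency.)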

# Preallocate NumPy arrays, then transfer to the GPU using torch.tensor - WORKS
def prealloc_np(states, device, dtype=torch.float32):
    n = len(states)
    img_shape = states[0]["image"].shape
    vec_shape = states[0]["vector"].shape

    images = np.empty((n, *img_shape), dtype=np.float32)
    vectors = np.empty((n, *vec_shape), dtype=np.float32)

    for i in range(n):
        images[i] = states[i]["image"]
        vectors[i] = states[i]["vector"]

    states_images_tensor = torch.tensor(images, device=device, dtype=dtype) / 255.0
    states_vector_tensor = torch.tensor(vectors, device=device, dtype=dtype)

    return states_images_tensor, states_vector_tensor
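# (This version also stages ~31 MiB per call, but in NumPy-owned float32 arrays
# rather than a PyTorch CPU tensor, so I treat it as a workaround, not an
# explanation.)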


def update_network(func: callable, memory_buffer: list[dict], current_buffer_size: int, batch_size: int = 128):
    # sample batch_size distinct indices from the filled slots 0..current_buffer_size
    indices = np.random.choice(current_buffer_size + 1, size=batch_size, replace=False)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    states = [memory_buffer[i]["state"] for i in indices]
    func(states, device=device)

def train(rank, func: callable, buffer_size: int, num_steps_exploration: int, num_training_steps: int):
    memory_buffer = [None] * buffer_size
    curr_state = create_state()

    torch.cuda.empty_cache()
    gc.collect()
    print(f"{get_memory():.2f}")
    start = time.perf_counter_ns()

    for i in range(num_steps_exploration):
        new_state = create_state()
        experience = create_experience(curr_state, new_state)
        memory_buffer[i % buffer_size] = experience
        curr_state = new_state

    for i in range(num_steps_exploration, num_training_steps):
        new_state = create_state()
        experience = create_experience(curr_state, new_state)
        memory_buffer[i % buffer_size] = experience
        curr_state = new_state
        update_network(func, memory_buffer, current_buffer_size=i)

        if (i + 1) % 500 == 0:
            print(f"{get_memory():.2f}")

    
    end = time.perf_counter_ns()
    print(f"{(end - start) / 1e9:.2f}")
    
def main():
    functions = [stack]  # swap in prealloc_np to compare the two paths
    num_training_steps: int = 4_000
    num_steps_exploration: int = 2_000
    buffer_size: int = num_training_steps
    num_trials: int = 1

    print(f"PID: {os.getpid()}")

    for func in functions:
        print(f"Using function: {func.__name__}")

        for trial in range(num_trials):
            print(f"Round {trial + 1}")
            mp.spawn(train, args=(func, buffer_size, num_steps_exploration, num_training_steps), nprocs=1)

if __name__ == "__main__":
    if not torch.cuda.is_available():
        print("CUDA is not available. Please check your installation.")
    else:
        main()

The snippet contains both the original function stack() and my current workaround prealloc_np(). Unfortunately, I have been unable to determine exactly what the issue is, so I have no guarantee that the workaround is actually valid.

I have also ‘fixed’ the issue by:

  1. Calling [s["image"].copy() for s in states]
  2. Calling torch.cuda.empty_cache() after creating the GPU tensors
  3. Using .to(device) with non_blocking=True
  4. Pre-allocating a GPU tensor with torch.empty() and then copying the images across one at a time (sketched below)
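For reference, fix 4 looked roughly like the sketch below (not my exact code; prealloc_gpu is just a name for this post):

def prealloc_gpu(states, device, dtype=torch.float32):
    n = len(states)
    images = torch.empty((n, *states[0]["image"].shape), dtype=dtype, device=device)
    vectors = torch.empty((n, *states[0]["vector"].shape), dtype=dtype, device=device)
    for i, s in enumerate(states):
        # copy_() does the uint8 -> float32 conversion as part of the transfer,
        # so no batch-sized float32 staging buffer is ever built on the host
        images[i].copy_(torch.from_numpy(s["image"]))
        vectors[i].copy_(torch.from_numpy(s["vector"]))
    return images / 255.0, vectors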

I have tried using psutil, tracemalloc, torch.cuda.memory_stats(), and memray to locate the cause of the leak, but have failed miserably. I have also tried torch 2.7 and 2.9, on multiple machines running Ubuntu 22.04, and see the same inconsistent behavior.
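One further check that might narrow things down (assuming Linux/glibc, which matches the Ubuntu 22.04 machines): force the allocator to return free heap pages after each update and watch whether RSS drops. A minimal sketch:

import ctypes

# glibc-specific: ask malloc to return free heap pages to the OS
libc = ctypes.CDLL("libc.so.6")

def trim_heap():
    # malloc_trim(0) returns 1 if memory was actually released
    return libc.malloc_trim(0)

If RSS falls back after trim_heap(), the pages were free all along and this is allocator retention rather than a true leak; if it does not, something is still holding references.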