Hello everyone,
I have been running into an inconsistent issue when attempting to convert a list/batch of states (dictionaries mapping strings to ndarrays) into tensors on the GPU. With many of the approaches I have tried, the program sometimes increases RAM usage as expected, roughly 31 MiB per episode (500 steps), and other times appears to increase RAM usage by roughly 31 MiB per step. I have put together the code snippet below to reproduce the issue.
import numpy as np
import os
import psutil
import torch
import time
import gc
import torch.multiprocessing as mp

# Handle to the current process, used to report RSS in MiB.
process = psutil.Process(os.getpid())


def get_memory():
    return process.memory_info().rss / (1024 ** 2)


def create_state() -> dict[str, np.ndarray]:
    state = {
        "image": np.random.randint(0, 256, (9, 84, 84), dtype=np.uint8),
        "vector": np.random.rand(128).astype(np.float32)
    }
    return state


def create_experience(curr_state: dict[str, np.ndarray], new_state: dict[str, np.ndarray]) -> dict[str, dict[str, np.ndarray] | int | float | bool]:
    experience = {
        "state": curr_state,
        "action": np.random.randint(0, 4),
        "reward": np.random.rand(),
        "next_state": new_state,
        "done": np.random.choice([True, False])
    }
    return experience


# Reduced with np.stack and with/without torch.no_grad() - FAIL
def stack(states, device, dtype=torch.float32):
    states_images = np.stack([s["image"] for s in states])
    states_vector = np.stack([s["vector"] for s in states])
    with torch.no_grad():
        states_images_tensor = torch.as_tensor(states_images, dtype=dtype, device=device) / 255.0
        states_vector_tensor = torch.as_tensor(states_vector, dtype=dtype, device=device)
    return states_images_tensor, states_vector_tensor


# Preallocate numpy arrays then transfer to gpu using torch.tensor - WORKS
def prealloc_np(states, device, dtype=torch.float32):
    n = len(states)
    img_shape = states[0]["image"].shape
    vec_shape = states[0]["vector"].shape
    images = np.empty((n, *img_shape), dtype=np.float32)
    vectors = np.empty((n, *vec_shape), dtype=np.float32)
    for i in range(n):
        images[i] = states[i]["image"]
        vectors[i] = states[i]["vector"]
    states_images_tensor = torch.tensor(images, device=device, dtype=dtype) / 255.0
    states_vector_tensor = torch.tensor(vectors, device=device, dtype=dtype)
    return states_images_tensor, states_vector_tensor


def update_network(func: callable, memory_buffer: list[dict[str, np.ndarray]], current_buffer_size: int, batch_size: int = 128):
    # Sample a random batch of stored states and convert them with the chosen function.
    indices = np.random.choice(current_buffer_size + 1, size=batch_size, replace=False)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    states = [memory_buffer[i]["state"] for i in indices]
    func(states, device=device)


def train(rank, func: callable, buffer_size: int, num_steps_exploration: int, num_training_steps: int):
    memory_buffer = [None] * buffer_size
    curr_state = create_state()
    torch.cuda.empty_cache()
    gc.collect()
    print(f"{get_memory():.2f}")
    start = time.perf_counter_ns()
    # Exploration phase: only fill the buffer.
    for i in range(num_steps_exploration):
        new_state = create_state()
        experience = create_experience(curr_state, new_state)
        memory_buffer[i % buffer_size] = experience
        curr_state = new_state
    # Training phase: keep collecting and sample/convert a batch every step.
    for i in range(num_steps_exploration, num_training_steps):
        new_state = create_state()
        experience = create_experience(curr_state, new_state)
        memory_buffer[i % buffer_size] = experience
        curr_state = new_state
        update_network(func, memory_buffer, current_buffer_size=i)
        if (i + 1) % 500 == 0:
            print(f"{get_memory():.2f}")
    end = time.perf_counter_ns()
    print(f"{(end - start) / 1e9:.2f}")


def main():
    functions = [stack]
    num_training_steps: int = 4_000
    num_steps_exploration: int = 2_000
    buffer_size: int = num_training_steps
    num_trials: int = 1
    print(f"PID: {os.getpid()}")
    for func in functions:
        print(f"Using function: {func.__name__}")
        for trial in range(num_trials):
            print(f"Round {trial + 1}")
            mp.spawn(train, args=(func, buffer_size, num_steps_exploration, num_training_steps), nprocs=1)


if __name__ == "__main__":
    if not torch.cuda.is_available():
        print("CUDA is not available. Please check your installation.")
    else:
        main()
The snippet contains the original function stack() and my current workaround prealloc_np(). Unfortunately, I have been unable to determine exactly what the issue is, and I have no guarantee that my workaround is actually valid.
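For anyone who wants to compare the two code paths in isolation, a small harness along these lines should work (a sketch only; it assumes the stack()/prealloc_np() definitions from the snippet above and simply reports the RSS delta over repeated calls on one fixed batch):

import os

import numpy as np
import psutil
import torch


def rss_mib():
    return psutil.Process(os.getpid()).memory_info().rss / (1024 ** 2)


def measure(func, n_calls: int = 50, batch_size: int = 128):
    # Build one fixed batch of fake states with the same shapes as in the repro snippet.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    states = [
        {
            "image": np.random.randint(0, 256, (9, 84, 84), dtype=np.uint8),
            "vector": np.random.rand(128).astype(np.float32),
        }
        for _ in range(batch_size)
    ]
    before = rss_mib()
    for _ in range(n_calls):
        func(states, device=device)
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    after = rss_mib()
    print(f"{func.__name__}: {after - before:+.2f} MiB over {n_calls} calls")


# measure(stack)
# measure(prealloc_np)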
I have also 'fixed' the issue by:
- Calling [s["image"].copy() for s in states]
- Calling torch.cuda.empty_cache() after creating the GPU tensor
- Using .to(device) with non_blocking=True
- Pre-allocating a GPU tensor with torch.empty() and then copying the images across one at a time (roughly as sketched below)
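The last of those workarounds looks roughly like this (a sketch only; prealloc_gpu is an illustrative name, not a function from the snippet above):

# Sketch of the torch.empty() workaround: pre-allocate the batch tensors on the
# GPU once, then copy each state's arrays across individually.
def prealloc_gpu(states, device, dtype=torch.float32):
    n = len(states)
    images = torch.empty((n, *states[0]["image"].shape), dtype=dtype, device=device)
    vectors = torch.empty((n, *states[0]["vector"].shape), dtype=dtype, device=device)
    for i, s in enumerate(states):
        images[i] = torch.from_numpy(s["image"])    # uint8 -> float32 cast happens during the copy
        vectors[i] = torch.from_numpy(s["vector"])
    return images / 255.0, vectors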
I have tried using psutil, tracemalloc, torch.cuda.memory_stats(), and memray to locate the source of the leak, but have failed miserably. I have also tried torch 2.7 and 2.9, and have run the snippet on multiple machines running Ubuntu 22.04, all with the same inconsistent behaviour.
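For reference, the tracemalloc attempts followed the usual snapshot-diff pattern, roughly like this (a sketch of the general approach rather than my exact instrumentation; note it only sees host-side Python allocations, not memory held by the CUDA caching allocator):

import tracemalloc

tracemalloc.start(25)  # keep up to 25 frames per allocation traceback

snap_before = tracemalloc.take_snapshot()
# ... run e.g. 500 training steps here ...
snap_after = tracemalloc.take_snapshot()

# Show the ten lines whose allocations grew the most between the snapshots.
for stat in snap_after.compare_to(snap_before, "lineno")[:10]:
    print(stat)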