And for some of you, code and hard numbers might speak better than words, so here is a simplified but typical situation where some GPU RAM gets allocated and some gets released:
import torch

def consume_gpu_ram(n): return torch.ones((n, n)).cuda()
def consume_gpu_ram_256mb(): return consume_gpu_ram(2**13)
# should be: 256 used, 512 peaked
c1 = consume_gpu_ram_256mb()
c2 = consume_gpu_ram_256mb()
del c1
I want a tool that will tell me that when this program has run, memory consumption peaked at 512MB and finally stabilized at 256MB (we are talking deltas here). I hope the progression is easy to follow:
start:                    -> using   0MB
c1 = ...: allocate 256MB  -> using 256MB
c2 = ...: allocate 256MB  -> using 512MB
del c1:   free     256MB  -> using 256MB
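The 256MB figure is just tensor-size arithmetic; a quick sanity check (assuming the default float32 dtype, i.e. 4 bytes per element):
n = 2**13
print(n * n * 4 / 2**20)  # 256.0 MB per tensor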
There is no way the current pytorch tools can tell me that there was a peak of 512MB there, unless it's the very first code that was run and no previously run code ever consumed more RAM than this program requires. So here is a better example:
# part 1: render torch.cuda.max_memory_allocated() useless for future peak computations
# in lower ranges by allocating a much bigger chunk of RAM and then freeing it.
z = [consume_gpu_ram_256mb() for i in range(4)] # 1GB
del z
# part 2: now measure:
# should be: 256 used, 512 peaked
c1 = consume_gpu_ram_256mb()
c2 = consume_gpu_ram_256mb()
del c1
torch.cuda.max_memory_allocated() will now report 1024MB, even though part 2 consumed only 512MB at its peak.
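Snapshotting the gauge before and after part 2 and subtracting doesn't help either: part 2's true 512MB peak never exceeds the 1GB already recorded, so the gauge doesn't move at all (continuing the example above):
peak_before = torch.cuda.max_memory_allocated()  # ~1GB, left over from part 1
c1 = consume_gpu_ram_256mb()
c2 = consume_gpu_ram_256mb()
del c1
peak_after = torch.cuda.max_memory_allocated()   # still ~1GB
print(int((peak_after - peak_before)/2**20))     # 0 -- the real 512MB peak is invisible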
And here is a simple program that shows how the cuda tools fall short, and how the workaround with a peak-measuring thread does measure the right thing (with a possible small error due to the thread's unpredictable sampling times).
You will need pynvml installed: pip/conda install nvidia-ml-py3
import threading, torch, time, pynvml
def preload_pytorch():
    torch.ones((1, 1)).cuda()

def gpu_mem_used(id):
    handle = pynvml.nvmlDeviceGetHandleByIndex(id)
    info = pynvml.nvmlDeviceGetMemoryInfo(handle)
    return int(info.used/2**20)

def gpu_mem_used_no_cache(id):
    torch.cuda.empty_cache()
    return gpu_mem_used(id)

def peak_monitor_start():
    global peak_monitoring
    peak_monitoring = True

    # this thread samples RAM usage as long as the current epoch of the fit loop is running
    peak_monitor_thread = threading.Thread(target=peak_monitor_func)
    peak_monitor_thread.daemon = True
    peak_monitor_thread.start()

def peak_monitor_stop():
    global peak_monitoring
    peak_monitoring = False

def peak_monitor_func():
    global nvml_peak, peak_monitoring
    nvml_peak = 0
    id = torch.cuda.current_device()
    while True:
        nvml_peak = max(gpu_mem_used(id), nvml_peak)
        if not peak_monitoring: break
        time.sleep(0.001) # 1msec
def consume_gpu_ram(n): return torch.ones((n, n)).cuda()
def consume_gpu_ram_256mb(): return consume_gpu_ram(2**13)
peak_monitoring = False
nvml_peak = 0
preload_pytorch()
pynvml.nvmlInit()
id = torch.cuda.current_device()
# push pytorch's peak gauge high up and then release the memory
z = [consume_gpu_ram_256mb() for i in range(4)] # 1GB
del z
peak_monitor_start()
nvml_before = gpu_mem_used_no_cache(id)
cuda_before = int(torch.cuda.memory_allocated()/2**20)
# should be: 256 used, 512 peaked
c1 = consume_gpu_ram_256mb()
c2 = consume_gpu_ram_256mb()
del c1
# code finished
peak_monitor_stop()
nvml_after = gpu_mem_used_no_cache(id)
cuda_after = int(torch.cuda.memory_allocated()/2**20)
cuda_peak = int(torch.cuda.max_memory_allocated()/2**20)
print("nvml:", nvml_after-nvml_before, nvml_peak-nvml_before)
print("cuda:", cuda_after-cuda_before, cuda_peak-cuda_before)
Output:
nvml: 256 512
cuda: 256 1024
cuda tools can’t give me the right answer of 512MB here.
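If you need this measurement in more than one place, the same sampling-thread workaround can be packaged into a small context manager. This is just a sketch of one way to do it (the class name and interface are mine, not an existing API):
class peak_monitor():
    def __enter__(self):
        pynvml.nvmlInit()
        self.handle = pynvml.nvmlDeviceGetHandleByIndex(torch.cuda.current_device())
        self.keep_running = True
        self.begin = self.peak = self.used_no_cache()
        self.thread = threading.Thread(target=self.monitor)
        self.thread.daemon = True
        self.thread.start()
        return self

    def __exit__(self, *exc):
        self.keep_running = False
        self.thread.join()
        self.end = self.used_no_cache()

    def used(self):
        return int(pynvml.nvmlDeviceGetMemoryInfo(self.handle).used/2**20)

    def used_no_cache(self):
        torch.cuda.empty_cache()  # so cached-but-freed blocks don't inflate the reading
        return self.used()

    def monitor(self):
        while self.keep_running:
            self.peak = max(self.peak, self.used())
            time.sleep(0.001)

# usage:
# with peak_monitor() as mon:
#     c1 = consume_gpu_ram_256mb()
#     c2 = consume_gpu_ram_256mb()
#     del c1
# print("used delta:", mon.end-mon.begin, "peak delta:", mon.peak-mon.begin)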
Now look at the simplicity of tracemalloc doing the same thing:
import tracemalloc, numpy as np
def consume_cpu_ram(n): return np.ones((n, n))
def consume_cpu_ram_128mb(): return consume_cpu_ram(2**12)
# push the process' peak gauge high up and then release the memory
z = [consume_cpu_ram_128mb() for i in range(8)] # 1GB
del z
tracemalloc.start()
# expecting peak requirements of 256MB, and final 128MB
a1 = consume_cpu_ram_128mb()
a2 = consume_cpu_ram_128mb()
del a1
cpu_current, cpu_peak = list(map(lambda x: int(x/2**20), tracemalloc.get_traced_memory()))
tracemalloc.stop()
print(cpu_current, cpu_peak)
Output:
128 256
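And the tracemalloc version is just as easy to package for reuse; a sketch (the class name is mine):
class cpu_peak_monitor():
    def __enter__(self):
        tracemalloc.start()
        return self

    def __exit__(self, *exc):
        current, peak = tracemalloc.get_traced_memory()
        tracemalloc.stop()
        self.used, self.peak = int(current/2**20), int(peak/2**20)

# usage:
# with cpu_peak_monitor() as mon:
#     a1 = consume_cpu_ram_128mb()
#     a2 = consume_cpu_ram_128mb()
#     del a1
# print(mon.used, mon.peak)  # expecting: 128 256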
pytorch could do exactly the same; here is some pseudo-code:
class max_memory_allocated_local():
    def start(self):
        self.begin = memory_allocated()
        max_memory_allocated_local_reset() # put the peak gauge to zero
    def stop(self):
        self.end = memory_allocated()
        self.peak = max_memory_allocated_local()
    def get_traced_memory(self):
        return self.end-self.begin, self.peak-self.begin
It introduces a local max_memory_allocated_local gauge, which can be reset by the user, but which otherwise works the same as max_memory_allocated().
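For reference, newer pytorch releases provide torch.cuda.reset_max_memory_allocated(), which resets the peak gauge to the current usage, so the pseudo-code above can be fleshed out into something runnable along these lines (a sketch, mirroring tracemalloc's interface):
class max_memory_allocated_local():
    def start(self):
        self.begin = torch.cuda.memory_allocated()
        torch.cuda.reset_max_memory_allocated()  # peak gauge now starts from the current usage
    def stop(self):
        self.end  = torch.cuda.memory_allocated()
        self.peak = torch.cuda.max_memory_allocated()
    def get_traced_memory(self):
        return self.end-self.begin, self.peak-self.begin

# usage:
# m = max_memory_allocated_local()
# m.start()
# c1 = consume_gpu_ram_256mb(); c2 = consume_gpu_ram_256mb(); del c1
# m.stop()
# print([int(x/2**20) for x in m.get_traced_memory()])  # expecting: [256, 512]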