Excellent, thanks! I'll check your notebooks.
A couple of simple tests tell me the following — unless nvidia-smi itself is reporting incorrectly (unlikely):
- PyTorch doesn't seem to be aware of memory allocated simply by initializing the CUDA context
- cache use is reported differently by nvidia-smi than by PyTorch
- I thought the extra "max allocated" memory was related to future allocation for gradient use, but for some reason it appears even when I specifically pass requires_grad=False, and it doesn't go away when I delete tensors.
I agree with you on the need, and I am curious about how PyTorch measures and cleans up CUDA memory. Torch is deceptively simple with regard to CUDA, and yet paying attention to cleaning up memory is more important than it appears at a glance.
# torch.__version__ 1.1.0
import torch
import subprocess as sp
def report(msg, nv, pt, tm):
    """Sample GPU memory from nvidia-smi and torch, print a snapshot, and
    return the running histories.

    Args:
        msg: label printed above the snapshot.
        nv:  history of nvidia-smi ``memory.used`` readings (MB).
        pt:  history of ``torch.cuda.memory_allocated()`` readings (MB).
        tm:  history of ``torch.cuda.max_memory_allocated()`` readings (MB).

    Returns:
        ``(nv, tm, pt)`` — note the order differs from the argument order;
        callers unpack as ``nv, tm, pt = report(...)`` and pass back as
        ``report(msg, nv, pt, tm)``, so the pairing stays consistent.
    """
    query = ['nvidia-smi', '--query-gpu=memory.used',
             '--format=csv,nounits,noheader']
    # First output line is the reading for GPU 0.
    first_line = sp.check_output(query, encoding='utf-8').split('\n')[0]
    nv.append(int(first_line))
    mib = 2 ** 20  # bytes per MB
    tm.append(torch.cuda.max_memory_allocated() // mib)
    pt.append(torch.cuda.memory_allocated() // mib)
    # Each line shows the latest sample and its delta from the first sample.
    print(f"\n{msg}")
    print(f" nvida smi:\t{nv[-1]} MB\ttotal torch: {nv[-1] - nv[0]}")
    print(f" torch mem :\t{pt[-1]} MB\ttotal torch: {pt[-1] - pt[0]}")
    print(f" torch maxmem:\t{tm[-1]} MB\ttotal torch: {tm[-1] - tm[0]}")
    return nv, tm, pt
def check_mem(overwrite=True):
    """Allocate, transform, and free CUDA tensors, reporting memory at each step.

    Compares two cleanup strategies:
      overwrite=True  -- rebind ``T`` to the stacked result (the old storage
                         loses its last reference), then ``del G`` afterwards;
      overwrite=False -- keep the result in ``T1`` and explicitly ``del T``
                         and ``del G`` *before* measuring.

    Returns the surviving tensor (``T`` or ``T1``) so it stays allocated
    past the final report.
    """
    nv, tm, pt = report("init",[],[],[])
    # First CUDA tensor forces context initialization: nvidia-smi jumps,
    # while torch's allocator counters stay at 0 (see transcript below).
    t = torch.empty(1, device="cuda:0", requires_grad=False)
    nv, tm, pt = report("cuda empty tensor no grad size 1 added", nv,pt,tm)
    T = torch.randn((10,3,1024,1024), device="cuda", requires_grad=False)
    nv, tm, pt = report("torch.rndn((10,3,1024,1024)) no grad", nv,pt,tm)
    # Weighted channel sum (the weights look like RGB->luminance
    # coefficients); G has the channel dim reduced away.
    G = (T.mul(T.new([[[[0.2989]], [[0.5870]], [[0.1140]]]]))).sum(1)
    if overwrite:
        T = torch.stack([G, G, G], dim=1)
        T.requires_grad = False
        # NOTE(review): here G is still alive when the report is taken,
        # unlike the else branch which deletes before measuring — confirm
        # this asymmetry is intentional for the comparison.
        nv, tm, pt = report("stack and overwrite", nv,pt,tm)
        del G
    else:
        T1 = torch.stack([G, G, G], dim=1)
        T1.requires_grad = False
        del T
        del G
        nv, tm, pt = report("stack and delete and replace", nv,pt,tm)
    # Release cached blocks back to the driver so nvidia-smi can drop;
    # max_memory_allocated() is a high-water mark and is not reset by this.
    torch.cuda.empty_cache()
    nv, tm, pt = report("empty cache", nv,pt,tm)
    if overwrite:
        return T
    return T1
# NOTE(review): the transcript below shows the overwrite-branch label
# ("stack and overwrite"), but check_mem(False) takes the delete branch
# ("stack and delete and replace") — confirm which run produced this output.
check_mem(False)
"""
init
nvida smi: 2710 MB total torch: 0
torch mem : 0 MB total torch: 0
torch maxmem: 0 MB total torch: 0
cuda empty tensor no grad size 1 added
nvida smi: 3155 MB total torch: 445
torch mem : 0 MB total torch: 0
torch maxmem: 0 MB total torch: 0
torch.rndn((10,3,1024,1024)) no grad
nvida smi: 3275 MB total torch: 565
torch mem : 120 MB total torch: 120
torch maxmem: 120 MB total torch: 120
stack and overwrite
nvida smi: 3435 MB total torch: 725
torch mem : 160 MB total torch: 160
torch maxmem: 280 MB total torch: 280
empty cache
nvida smi: 3275 MB total torch: 565
torch mem : 120 MB total torch: 120
torch maxmem: 280 MB total torch: 280
"""