If an input a is used in two losses, loss1 and loss2, will calling "loss1.backward()" release the input's GPU memory? If not, and I then run "del loss2", will the memory of the input be released? Or must I run "loss2.backward()" to release the input's memory?
I think the bit of code below is a nice illustration of your question and the surrounding use cases.
import torch
import gc
import time

def print_gpu_memory():
    """Print GPU memory usage."""
    if torch.cuda.is_available():
        print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
        print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
    else:
        print("CUDA not available")

def test_memory_behavior(case):
    """Test memory behavior for different scenarios."""
    print(f"\n==== Testing Case {case} ====")

    # Clear memory
    torch.cuda.empty_cache()
    gc.collect()
    print_gpu_memory()

    # Create a large tensor with requires_grad=True
    print("Creating input tensor...")
    a = torch.randn(5000, 5000, device='cuda', requires_grad=True)
    print_gpu_memory()

    # Create two losses using the same input
    print("Creating two losses...")
    loss1 = a.sum() * 2
    loss2 = a.mean() * 3
    print_gpu_memory()

    if case == 1:
        # Case 1: Just call loss1.backward() and check memory
        print("Calling loss1.backward()...")
        loss1.backward(retain_graph=True)  # Must retain graph for loss2
        print_gpu_memory()

        # Wait and check if memory changes
        print("Waiting 2 seconds...")
        time.sleep(2)
        print_gpu_memory()
    elif case == 2:
        # Case 2: Call loss1.backward() then delete loss2
        print("Calling loss1.backward()...")
        loss1.backward(retain_graph=True)
        print_gpu_memory()

        print("Deleting loss2...")
        del loss2
        torch.cuda.empty_cache()
        gc.collect()
        print_gpu_memory()
    elif case == 3:
        # Case 3: Call loss1.backward() then loss2.backward()
        print("Calling loss1.backward()...")
        loss1.backward(retain_graph=True)
        print_gpu_memory()

        print("Calling loss2.backward()...")
        loss2.backward()
        print_gpu_memory()
    elif case == 4:
        # Case 4: Call loss1.backward() with retain_graph=False (should error)
        try:
            print("Calling loss1.backward() without retaining graph...")
            loss1.backward()  # No retain_graph
            print("Trying to call loss2.backward()...")
            loss2.backward()  # This should error
        except RuntimeError as e:
            print(f"Expected error: {e}")
        print_gpu_memory()
    # Cleanup
    del a, loss1
    if case != 2:  # in case 2, loss2 was already deleted above
        del loss2
    torch.cuda.empty_cache()
    gc.collect()
    print("After cleanup:")
    print_gpu_memory()
if __name__ == "__main__":
    if not torch.cuda.is_available():
        print("CUDA not available. Please run on a GPU-enabled system.")
        exit()

    print("Testing memory behavior in PyTorch with multiple losses sharing an input")

    # Test all cases
    for case in range(1, 5):
        test_memory_behavior(case)
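To make the "del loss2" part of the question explicit, here is one more small sketch (not part of the script above): the input a stays allocated as long as your own reference to it exists, while dropping the last reference to a loss frees that loss's graph, including any intermediate tensors the graph saved for backward. I'm using exp() here simply because it is an op that saves its output for backward:

import torch

a = torch.randn(5000, 5000, device='cuda', requires_grad=True)  # roughly 100 MB
print(f"after creating a: {torch.cuda.memory_allocated() / 1024**2:.0f} MB")

b = a.exp()        # exp saves its output for backward -> roughly another 100 MB
loss = b.sum()
del b              # the graph behind loss still keeps the saved output alive
print(f"after del b:      {torch.cuda.memory_allocated() / 1024**2:.0f} MB")

del loss           # last reference to the graph is gone -> the saved output is freed
print(f"after del loss:   {torch.cuda.memory_allocated() / 1024**2:.0f} MB")

del a              # the input itself is only freed once you drop your reference to it
print(f"after del a:      {torch.cuda.memory_allocated() / 1024**2:.0f} MB")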
Thanks, this helps a lot!
You might also be interested in Understanding GPU Memory 1: Visualizing All Allocations over Time | PyTorch
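In case it is useful, here is a minimal sketch of the snapshot workflow that post describes (assuming PyTorch 2.1 or newer; _record_memory_history / _dump_snapshot are private APIs and may change):

import torch

# Start recording allocator events (including stack traces).
torch.cuda.memory._record_memory_history(max_entries=100000)

# Run the workload you want to inspect, e.g. one of the test cases above.
a = torch.randn(5000, 5000, device='cuda', requires_grad=True)
loss1 = a.sum() * 2
loss1.backward()

# Dump a snapshot and stop recording; drag the .pickle file onto
# https://pytorch.org/memory_viz to see allocations over time.
torch.cuda.memory._dump_snapshot("multi_loss_snapshot.pickle")
torch.cuda.memory._record_memory_history(enabled=None)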
I was able to get case 4 to work.
Adding another intermediate node to the graph gives us the expected error:
b = a * 2
loss1 = b.sum() * 2
loss2 = b.mean() * 3
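For completeness, a self-contained version of that modified case 4 (just a sketch; whether and how the second backward() fails can vary with the PyTorch version, since it depends on the shared node having freed its saved state):

import torch

a = torch.randn(5000, 5000, device='cuda', requires_grad=True)
b = a * 2          # shared intermediate node that sits in both graphs
loss1 = b.sum() * 2
loss2 = b.mean() * 3

loss1.backward()   # retain_graph defaults to False, so the shared graph is freed

try:
    loss2.backward()
except RuntimeError as e:
    # Expected: "Trying to backward through the graph a second time ..."
    print(f"Expected error: {e}")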