PyTorch backward and memory release question

If an input a is used in two losses, loss1 and loss2, will “loss1.backward()” release the input's GPU memory? If not, and I then run “del loss2”, will the input's memory be released? Or must I run “loss2.backward()” to release the input's memory?

I think the bit of code below is a nice illustration of your question and the surrounding use cases.

import torch
import gc
import time

def print_gpu_memory():
    """Print GPU memory usage."""
    if torch.cuda.is_available():
        print(f"GPU memory allocated: {torch.cuda.memory_allocated() / 1024**2:.2f} MB")
        print(f"GPU memory reserved: {torch.cuda.memory_reserved() / 1024**2:.2f} MB")
    else:
        print("CUDA not available")

def test_memory_behavior(case):
    """Test memory behavior for different scenarios."""
    print(f"\n==== Testing Case {case} ====")
    
    # Clear memory
    torch.cuda.empty_cache()
    gc.collect()
    print_gpu_memory()
    
    # Create a large tensor with requires_grad=True
    print("Creating input tensor...")
    a = torch.randn(5000, 5000, device='cuda', requires_grad=True)
    print_gpu_memory()
    
    # Create two losses using the same input
    print("Creating two losses...")
    loss1 = a.sum() * 2
    loss2 = a.mean() * 3
    print_gpu_memory()
    
    if case == 1:
        # Case 1: Just call loss1.backward() and check memory
        print("Calling loss1.backward()...")
        loss1.backward(retain_graph=True)  # Must retain graph for loss2
        print_gpu_memory()
        
        # Wait and check if memory changes
        print("Waiting 2 seconds...")
        time.sleep(2)
        print_gpu_memory()
        
    elif case == 2:
        # Case 2: Call loss1.backward() then delete loss2
        print("Calling loss1.backward()...")
        loss1.backward(retain_graph=True)
        print_gpu_memory()
        
        print("Deleting loss2...")
        del loss2
        torch.cuda.empty_cache()
        gc.collect()
        print_gpu_memory()
        
    elif case == 3:
        # Case 3: Call loss1.backward() then loss2.backward()
        print("Calling loss1.backward()...")
        loss1.backward(retain_graph=True)
        print_gpu_memory()
        
        print("Calling loss2.backward()...")
        loss2.backward()
        print_gpu_memory()
        
    elif case == 4:
        # Case 4: Call loss1.backward() with retain_graph=False (should error)
        try:
            print("Calling loss1.backward() without retaining graph...")
            loss1.backward()  # No retain_graph
            print("Trying to call loss2.backward()...")
            loss2.backward()  # This should error
        except RuntimeError as e:
            print(f"Expected error: {e}")
        print_gpu_memory()
    
    # Cleanup
    del a, loss1
    if case != 2:  # loss2 was already deleted in case 2
        del loss2
    torch.cuda.empty_cache()
    gc.collect()
    print("After cleanup:")
    print_gpu_memory()

if __name__ == "__main__":
    if not torch.cuda.is_available():
        print("CUDA not available. Please run on a GPU-enabled system.")
        exit()
        
    print("Testing memory behavior in PyTorch with multiple losses sharing an input")
    
    # Test all cases
    for case in range(1, 5):
        test_memory_behavior(case)
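
To summarize what I'd expect to see (this is my understanding of the caching allocator, so take it with a grain of salt): backward() only frees the intermediate buffers saved by the autograd graph, not the inputs. a is a leaf tensor and stays allocated for as long as any Python reference to it exists, so neither loss1.backward() nor del loss2 releases it; only dropping the last reference to a does. A minimal sketch of just that point:

import torch

a = torch.randn(5000, 5000, device='cuda', requires_grad=True)
loss1 = a.sum() * 2
loss2 = a.mean() * 3

loss1.backward(retain_graph=True)
# a is a leaf held by this script, so it stays allocated; backward() has
# also allocated a.grad of the same size.
print(f"{torch.cuda.memory_allocated() / 1024**2:.2f} MB")

del loss2
# loss2 is just a scalar plus its small graph, so a is untouched.
print(f"{torch.cuda.memory_allocated() / 1024**2:.2f} MB")

del loss1, a
# Only once the last reference to a is gone (a.grad lives on a and goes with it)
# is its memory returned to the caching allocator.
torch.cuda.empty_cache()  # hands the now-unused cached blocks back to the driver
print(f"{torch.cuda.memory_allocated() / 1024**2:.2f} MB")

Note that empty_cache() doesn't free live tensors; it only returns blocks the allocator has already cached back to the driver, which matters if another process needs that memory.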

Thanks, this helps a lot!

You might also be interested in the PyTorch blog post Understanding GPU Memory 1: Visualizing All Allocations over Time.
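
If it helps, the workflow from that post is essentially: record an allocation history, dump a snapshot, and drop it into pytorch.org/memory_viz. A rough sketch (note that _record_memory_history and _dump_snapshot are private, underscore-prefixed APIs, so the exact arguments may change between releases):

import torch

# Start recording allocation/free events on the current device
torch.cuda.memory._record_memory_history(max_entries=100000)

# ... run the code you want to profile, e.g. one of the cases above ...

# Dump the recorded history and stop recording
torch.cuda.memory._dump_snapshot("snapshot.pickle")
torch.cuda.memory._record_memory_history(enabled=None)

# Then open https://pytorch.org/memory_viz and drag snapshot.pickle onto the page.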

I was able to get case 4 to work: loss1 and loss2 only share the leaf tensor a there, not any intermediate autograd nodes, so freeing loss1's graph doesn't touch loss2's graph and both backward calls succeed.

Adding a shared intermediate node to the graph gives us the expected error:

b = a * 2
loss1 = b.sum() * 2
loss2 = b.mean() * 3
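
Spelled out as a self-contained repro (the exact error text can differ across PyTorch versions):

import torch

a = torch.randn(5000, 5000, device='cuda', requires_grad=True)
b = a * 2            # both losses now backprop through this shared node
loss1 = b.sum() * 2
loss2 = b.mean() * 3

loss1.backward()     # frees the buffers saved by the shared part of the graph
loss2.backward()     # RuntimeError: Trying to backward through the graph a second time ...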