I run out of GPU memory when training my model. The leak seems to happen at the first call of loss.backward(). I guess that somehow a copy of the graph remains in memory, but I can't see where it happens or what to do about it.
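To illustrate what I mean by a copy of the graph staying in memory, here is a minimal sketch (a toy linear model, not my actual network) where keeping a reference to the loss tensor keeps its whole autograd graph, and therefore the saved activations, on the GPU:

```python
import torch

# Toy example only (not my real network): a small linear model on the GPU.
model = torch.nn.Linear(1000, 1000).to('cuda:0')
x = torch.randn(64, 1000, device='cuda:0')

kept_losses = []
for step in range(3):
    out = model(x)
    loss = out.pow(2).mean()
    # Keeping the tensor itself keeps its grad_fn, i.e. the whole graph
    # (including saved activations), so allocated memory grows every iteration.
    kept_losses.append(loss)
    # kept_losses.append(loss.item())  # a plain float would not keep the graph
    mem = torch.cuda.memory_stats(device='cuda:0')["allocated_bytes.all.current"]
    print(f'step {step}: {mem:,}')
```

I can't find an equivalent pattern in my own code, though.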
Here’s my fit function:
```python
val_loss_best = np.inf

# Prepare loss history
losses = []
for epoch in range(epochs):
    for idx_batch, (x, y) in enumerate(dataloader_train):
        optimizer.zero_grad()

        # Propagate input
        mem_gpu = torch.cuda.memory_stats(device='cuda:0')["allocated_bytes.all.current"]
        print(f'Before forward: {mem_gpu:,}')
        netout = net(x.to(device), y.to(device))
        mem_gpu = torch.cuda.memory_stats(device='cuda:0')["allocated_bytes.all.current"]
        print(f'After forward: {mem_gpu:,}')

        # Compute loss on the training set
        loss = loss_function(netout, y.to(device))
        mem_gpu = torch.cuda.memory_stats(device='cuda:0')["allocated_bytes.all.current"]
        print(f'After loss calc: {mem_gpu:,}')

        # Backpropagate loss
        loss.backward()
        mem_gpu = torch.cuda.memory_stats(device='cuda:0')["allocated_bytes.all.current"]
        print(f'After backward: {mem_gpu:,}')

        # Update weights
        optimizer.step()

    # Compute loss on the validation set
    net.eval()
    mem_gpu = torch.cuda.memory_stats(device='cuda:0')["allocated_bytes.all.current"]
    print(f'After net.eval(): {mem_gpu:,}')
    # mem_gpu = torch.cuda.memory_stats(device='cuda:0')["allocated_bytes.all.current"]
    # print(f'After torch.no_grad(): {mem_gpu:,}')
    val_loss = compute_loss(net, dataloader_val, loss_function, device)  # float
    mem_gpu = torch.cuda.memory_stats(device='cuda:0')["allocated_bytes.all.current"]
    print(f'After val loss calc: {mem_gpu:,}')

    losses.append(val_loss)
    if val_loss < val_loss_best:
        val_loss_best = val_loss
    net.train()

return val_loss_best
```
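The repeated memory query is always the same torch.cuda.memory_stats call; it could be factored into a small helper like this (shown here only for readability, it is not in my actual code):

```python
import torch

def log_gpu_mem(tag: str, device: str = 'cuda:0') -> None:
    # Same query as in fit(): bytes currently held by the caching allocator.
    allocated = torch.cuda.memory_stats(device=device)["allocated_bytes.all.current"]
    print(f'{tag}: {allocated:,}')
```

The validation loss in fit() is computed by compute_loss: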
```python
def compute_loss(net: torch.nn.Module,
                 dataloader: torch.utils.data.DataLoader,
                 loss_function: torch.nn.Module,
                 device: torch.device = 'cpu') -> float:
    running_loss = 0
    with torch.no_grad():
        for idx_batch, (x, y) in enumerate(dataloader):  # iterate across batches
            netout = net(x.to(device))
            current_loss = loss_function(y.to(device), netout).item()
            running_loss += current_loss
    return running_loss / len(dataloader)
```
and loss_function is nn.MSELoss().
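For completeness, this is roughly how the loss is constructed and called (toy tensors just to show the call; nn.MSELoss is symmetric in its two arguments, so the (target, prediction) order I use in compute_loss gives the same value as (prediction, target)):

```python
import torch
import torch.nn as nn

loss_function = nn.MSELoss()  # mean squared error, averaged over all elements by default

# Toy tensors, shapes chosen arbitrarily for illustration.
prediction = torch.randn(8, 10)
target = torch.randn(8, 10)

loss = loss_function(prediction, target)
print(loss.item())  # reduce the scalar tensor to a plain Python float
```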
The output of this code is:
Before I call fit():
```
Before initialising model: 534,819,328
After initialising model: 635,895,808
```
Now we go into fit():
```
Before forward: 635,895,808
After forward: 647,901,184
After loss calc: 647,902,208
After backward: 711,877,120
Before forward: 787,703,808
After forward: 799,555,072
After loss calc: 799,555,584
After backward: 863,530,496
Before forward: 787,703,808
After forward: 799,555,072
After loss calc: 799,555,584
After backward: 863,530,496
```

… (this group of four lines keeps repeating until we exit from fit()).
Then, after moving the model back to the CPU: 686,627,328.
So I'm losing about 150 MB of GPU memory (686,627,328 − 534,819,328 bytes). Any ideas? Please help!