The following minimal code prints the GPU memory usage in bytes after each evaluation of the network.
The GPU memory usage increases at every iteration at an alarming rate. System RAM usage stays constant.
I have two questions: why there is a memory leak in the first place? I’ve set torch.no_grad()
(and self.eval()
, but this shouldn’t matter) so I’m pretty sure no additional gradient information is saved anywhere.
For the second question: why the leak happens on the GPU memory? I’ve explicitely set x.to(device_local)
so even if there is still graph or activations information stored in the output tensor it should cause an increase in system RAM, not on the GPU, I think.
import torch
import torch.nn as nn
import time
torch.manual_seed(0)
torch.no_grad()
input_size = 2048
device_fast = 'cuda:0'
device_local = 'cpu'
class toy_model(nn.Module):
def __init__(self):
super(toy_model, self).__init__()
self.conv1 = nn.Conv2d(1, 2**5, kernel_size=3, padding=1, stride=2)
self.conv2 = nn.Conv2d(2**5, 2**5, kernel_size=3, padding=1, stride=2)
self.conv3 = nn.Conv2d(2**5, 2**5, kernel_size=3, padding=1, stride=2)
self.conv4 = nn.Conv2d(2**5, 2**5, kernel_size=3, padding=1, stride=2)
self.conv5 = nn.Conv2d(2**5, 2**5, kernel_size=3, padding=1, stride=2)
self.conv6 = nn.Conv2d(2**5, 2**5, kernel_size=3, padding=1, stride=2)
self.conv7 = nn.Conv2d(2**5, 2**5, kernel_size=3, padding=1, stride=2)
self.conv8 = nn.Conv2d(2**5, 2**5, kernel_size=3, padding=1, stride=2)
self.conv9 = nn.Conv2d(2**5, 2**4, kernel_size=3, padding=1, stride=2)
self.linear1 = nn.Linear(2**4 * (input_size//(2**9))**2, 1)
def forward(self):
self.eval()
x = torch.rand(1, 1, input_size, input_size).to(device_fast)
x = torch.relu(self.conv1(x))
x = torch.relu(self.conv2(x))
x = torch.relu(self.conv3(x))
x = torch.relu(self.conv4(x))
x = torch.relu(self.conv5(x))
x = torch.relu(self.conv6(x))
x = torch.relu(self.conv7(x))
x = torch.relu(self.conv8(x))
x = torch.relu(self.conv9(x))
x = torch.flatten(x, 1)
x = torch.sigmoid(self.linear1(x));
return x.to(device_local)
toy = toy_model().to(device_fast)
estimates = []
for iter in range(100):
estimates.append( toy() )
torch.cuda.empty_cache()
print(iter, torch.cuda.max_memory_allocated(device_fast))
time.sleep(0.5)
print("Finished")
print(len(estimates))