OutOfMemoryError on A100 GPU

I am training a dilated CNN with 7 hidden layers to denoise medical images. I use the LoDoPaB-CT dataset, but only a small fraction of it (3553 samples, each of size 362x362); the prepared float32 tensors for samples and labels are each about 3.5 GB. After creating the data loaders and starting training with batch size 64, I get the error:

OutOfMemoryError: CUDA out of memory. Tried to allocate 2.97 GiB (GPU 0; 39.56 GiB total capacity; 31.23 GiB already allocated; 912.56 MiB free; 37.06 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In my opinion, it should not be the case that either the dataset or the model is “too large” to train on such a powerful GPU with almost 40 GB of memory.
Is there a memory leak? Is a batch size of 64 still too large for images of size 362x362? I am unsure what I can try to debug this issue and would be glad for any help. Below are the network architecture, the data preparation, and the training loop, in case that helps.
Thanks in advance! :slight_smile:

import torch
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split

# Load the prepared float32 tensors (~3.5 GB each):
samples_tens = torch.load("/path/samples_tens.pt")
labels_tens = torch.load("/path/labels_tens.pt")

print(samples_tens.dtype, labels_tens.dtype)

# Split into training and validation:
test_size = 0.2
random_state = 42

obs_train, obs_test, labels_train, labels_test = train_test_split(
    samples_tens, labels_tens, test_size=test_size, random_state=random_state
)

# Create Datasets with tensors:
train_dataset = TensorDataset(obs_train, labels_train)
test_dataset = TensorDataset(obs_test, labels_test)
# test_dataset = train_dataset

# Create DataLoaders:
batch_size = 64
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, shuffle=False)  # default batch_size=1

import torch.nn as nn

class DilationCNN(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(DilationCNN, self).__init__()

        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=5, padding=1),
            nn.ReLU()
        )

        self.dil_conv1 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=5, padding=2, dilation=2),
            nn.BatchNorm2d(64),
            nn.ReLU()
        )

        self.dil_conv2 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=5, padding=3, dilation=3),
            nn.BatchNorm2d(128),
            nn.ReLU()
        )

        self.dil_conv3 = nn.Sequential(
            nn.Conv2d(128, 128, kernel_size=5, padding=4, dilation=4),
            nn.BatchNorm2d(128),
            nn.ReLU()
        )

        self.dil_conv4 = nn.Sequential(
            nn.Conv2d(128, 64, kernel_size=3, padding=10, dilation=3),
            nn.BatchNorm2d(64),
            nn.ReLU()
        )

        self.dil_conv5 = nn.Sequential(
            nn.Conv2d(128, 64, kernel_size=5, padding=6, dilation=2),
            nn.BatchNorm2d(64),
            nn.ReLU()
        )


        self.conv2 = nn.Sequential(
            nn.Conv2d(96, out_channels, kernel_size=5, padding=3),
            nn.ReLU()
        )

import math
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Device: {device}')

model = DilationCNN(in_channels=1, out_channels=1).to(device)

criterion = nn.MSELoss()
optim_ADAM = torch.optim.Adam(model.parameters())
optim_SGD = torch.optim.SGD(model.parameters(), lr=learning_rate)
n_adam_eps = math.floor(n_epochs * swtich_perc)
torch.autograd.set_detect_anomaly(True)
loss_hist = []
psnr_hist = []

for epoch in range(n_epochs):
    model.train()
    if epoch <= n_adam_eps:
        optimizer = optim_ADAM
    else:
        optimizer = optim_SGD

    for batch_idx, (samples, labels) in enumerate(tqdm(train_dataloader, desc="Going over batches...")):
        samples = samples.unsqueeze(1).to(device)
        labels = labels.to(device)

        outputs = model(samples)
        loss = criterion(outputs, labels.unsqueeze(1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f"Epoch [{epoch+1}/{n_epochs}], Batch [{batch_idx+1}/{len(train_dataloader)}], Loss: {loss.item():.4f}")
        loss_hist.append(loss.item())

    model.eval()
    total_psnr = 0.0

    with torch.no_grad():
        for samples, labels in test_dataloader:

            samples = samples.unsqueeze(1).to(device)
            labels = labels.unsqueeze(1).to(device)

            outputs = model(samples)

            mse = criterion(outputs, labels)  # compare against the clean targets, not the noisy inputs
            psnr = 10 * torch.log10(1 / mse)
            total_psnr += psnr.item()

    average_psnr = total_psnr / len(test_dataloader)
    print(f"---|Epoch [{epoch+1}/{n_epochs}], Average PSNR: {average_psnr:.4f}")
    psnr_hist.append(average_psnr)

    torch.save(model.state_dict(), f'/path/model_1_ep{epoch+1}_{average_psnr}.pt')


Why do you think it’s not possible?
The memory usage is defined not only by the inputs and parameters of the model, but also by the intermediate forward activations. Conv layers in particular often use a tiny filter kernel while producing potentially huge outputs, which need to be stored for the gradient calculation.
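
A rough back-of-the-envelope sketch (counting only the conv outputs of your model and approximating the feature maps as 360x360; the exact sizes differ slightly because the paddings don’t preserve 362x362) already shows the scale at batch size 64:

batch_size = 64
h = w = 360                              # approximate spatial size of the feature maps
channels = [32, 64, 128, 128, 64, 64]    # outputs of conv1 and dil_conv1 ... dil_conv5
bytes_per_float32 = 4

activation_bytes = batch_size * h * w * bytes_per_float32 * sum(channels)
print(f"~{activation_bytes / 1024**3:.1f} GiB for the conv outputs alone")  # roughly 14.8 GiB

The non-inplace ReLU and BatchNorm layers store additional tensors of similar size for the backward pass, so the real activation footprint is a small multiple of that and, together with the inputs, gradients, and Adam states, easily fills the ~40 GiB of an A100.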

This post gives you a simple example.
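
If you want to check the scaling on your own setup, here is a minimal sketch (my own suggestion, assuming your complete DilationCNN with its forward method) that runs a single forward/backward pass per batch size and prints the peak memory reported by torch.cuda.max_memory_allocated:

import torch

model = DilationCNN(in_channels=1, out_channels=1).cuda()
criterion = torch.nn.MSELoss()

for bs in (1, 8, 16, 32):
    torch.cuda.reset_peak_memory_stats()
    x = torch.randn(bs, 1, 362, 362, device="cuda")   # dummy batch in your input shape
    out = model(x)
    loss = criterion(out, torch.randn_like(out))      # dummy target, just to trigger a backward pass
    loss.backward()
    model.zero_grad(set_to_none=True)
    print(f"batch size {bs}: peak {torch.cuda.max_memory_allocated() / 1024**3:.2f} GiB")
    del x, out, loss
    torch.cuda.empty_cache()

The peak should grow roughly linearly with the batch size, so you can pick the largest value that still fits on the device.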