I am training a dilated CNN with 7 hidden layers to denoise medical images. I use the LoDoPaB-CT dataset, but only a small fraction of it (3553 samples, each of size 362x362); the prepared float32 tensors (samples and labels) together come to about 3.5 GB. After creating the data loaders and starting training with batch size 64, I get the error:
OutOfMemoryError: CUDA out of memory. Tried to allocate 2.97 GiB (GPU 0; 39.56 GiB total capacity; 31.23 GiB already allocated; 912.56 MiB free; 37.06 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
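(For reference, the max_split_size_mb knob mentioned in the message is set via the documented PYTORCH_CUDA_ALLOC_CONF environment variable, although as far as I understand it this only mitigates fragmentation, not total usage:)

import os
# Must be set before the first CUDA allocation; 128 is just an example value
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"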
I find it hard to believe that either the dataset or the model is simply "too large" to train on such a powerful GPU with almost 40 GB of memory.
Is there a memory leak? Is batch size 64 still too large for images of size 362x362? I am unsure what I can try to debug this issue and would be glad for any help. Below are the network architecture, data preparation and training loop, in case that helps.
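My own back-of-envelope attempt (a rough sketch; I am not sure exactly what autograd keeps around for the backward pass) suggests the intermediate activations might be the problem:

# Size of ONE float32 activation map with 128 channels at batch size 64:
batch, channels, h, w = 64, 128, 362, 362
gib = batch * channels * h * w * 4 / 2**30
print(f"{gib:.2f} GiB per 128-channel map")  # ~4.00 GiB
# If autograd stores roughly one such map per conv layer for the backward
# pass, a few layers alone already approach the GPU's 40 GiB.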
Thanks in advance!
import torch
from sklearn.model_selection import train_test_split
from torch.utils.data import TensorDataset, DataLoader

samples_tens = torch.load("/path/samples_tens.pt")
labels_tens = torch.load("/path/labels_tens.pt")
print(samples_tens.dtype, labels_tens.dtype)  # torch.float32 torch.float32
# Split into training and validation:
test_size = 0.2
random_state = 42
obs_train, obs_test, labels_train, labels_test = train_test_split(
samples_tens, labels_tens, test_size=test_size, random_state=random_state
)
# Create Datasets with tensors:
train_dataset = TensorDataset(obs_train, labels_train)
test_dataset = TensorDataset(obs_test, labels_test)
# Create DataLoaders:
batch_size = 64
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, shuffle=False)  # no batch_size given, so it defaults to 1
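Both tensors stay in CPU RAM, so as far as I can tell the loaders themselves should not use any GPU memory; only the per-batch .to(device) copies in the training loop do. (Unrelated to the OOM itself, pinning host memory is supposed to speed those copies up; pin_memory is a standard DataLoader argument:)

# Optional variant with pinned host memory for faster CPU-to-GPU copies:
train_dataloader = DataLoader(train_dataset, batch_size=batch_size,
                              shuffle=True, pin_memory=True)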
import torch.nn as nn

class DilationCNN(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(DilationCNN, self).__init__()
        self.conv1 = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=5, padding=1),
            nn.ReLU()
        )
        self.dil_conv1 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=5, padding=2, dilation=2),
            nn.BatchNorm2d(64),
            nn.ReLU()
        )
        self.dil_conv2 = nn.Sequential(
            nn.Conv2d(64, 128, kernel_size=5, padding=3, dilation=3),
            nn.BatchNorm2d(128),
            nn.ReLU()
        )
        self.dil_conv3 = nn.Sequential(
            nn.Conv2d(128, 128, kernel_size=5, padding=4, dilation=4),
            nn.BatchNorm2d(128),
            nn.ReLU()
        )
        self.dil_conv4 = nn.Sequential(
            nn.Conv2d(128, 64, kernel_size=3, padding=10, dilation=3),
            nn.BatchNorm2d(64),
            nn.ReLU()
        )
        self.dil_conv5 = nn.Sequential(
            nn.Conv2d(128, 64, kernel_size=5, padding=6, dilation=2),
            nn.BatchNorm2d(64),
            nn.ReLU()
        )
        self.conv2 = nn.Sequential(
            nn.Conv2d(96, out_channels, kernel_size=5, padding=3),
            nn.ReLU()
        )

    # forward() not shown here; judging by the channel counts (128 into
    # dil_conv5, 96 into conv2) it concatenates earlier feature maps.
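To rule out the weights themselves, I counted the parameters; by my count the model has just under a million of them, i.e. only a few MiB as float32:

model_cpu = DilationCNN(in_channels=1, out_channels=1)  # CPU instance, just for counting
n_params = sum(p.numel() for p in model_cpu.parameters())
print(f"{n_params:,} parameters (~{n_params * 4 / 2**20:.1f} MiB as float32)")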
import math
from tqdm import tqdm

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Device: {device}")

model = DilationCNN(in_channels=1, out_channels=1).to(device)
criterion = nn.MSELoss()
optim_ADAM = torch.optim.Adam(model.parameters())
optim_SGD = torch.optim.SGD(model.parameters(), lr=learning_rate)

# learning_rate, n_epochs and switch_perc are set earlier (not shown).
# Train with Adam for the first switch_perc of the epochs, then switch to SGD:
n_adam_eps = math.floor(n_epochs * switch_perc)

torch.autograd.set_detect_anomaly(True)

loss_hist = []
psnr_hist = []
for epoch in range(n_epochs):
    model.train()
    if epoch <= n_adam_eps:
        optimizer = optim_ADAM
    else:
        optimizer = optim_SGD
    for batch_idx, (samples, labels) in enumerate(tqdm(train_dataloader, desc="Going over batches...")):
        samples = samples.unsqueeze(1).to(device)  # add channel dim: (B, 1, 362, 362)
        labels = labels.to(device)
        outputs = model(samples)
        loss = criterion(outputs, labels.unsqueeze(1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        print(f"Epoch [{epoch+1}/{n_epochs}], Batch [{batch_idx+1}/{len(train_dataloader)}], Loss: {loss.item():.4f}")
        loss_hist.append(loss.item())

    # Validation: average PSNR of the denoised outputs against the clean labels
    model.eval()
    total_psnr = 0.0
    with torch.no_grad():
        for samples, labels in test_dataloader:
            samples = samples.unsqueeze(1).to(device)
            labels = labels.unsqueeze(1).to(device)
            outputs = model(samples)
            mse = criterion(outputs, labels)
            psnr = 10 * torch.log10(1 / mse)  # assumes pixel values in [0, 1]
            total_psnr += psnr.item()
    average_psnr = total_psnr / len(test_dataloader)
    print(f"---|Epoch [{epoch+1}/{n_epochs}], Average PSNR: {average_psnr:.4f}")
    psnr_hist.append(average_psnr)
    torch.save(model.state_dict(), f"/path/model_1_ep{epoch+1}_{average_psnr:.2f}.pt")
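One thing I can still try is logging allocator statistics around the forward and backward pass, roughly like this (a sketch using torch.cuda's standard memory queries):

def log_cuda_mem(tag):
    # Current and peak GiB allocated by tensors on the default device:
    alloc = torch.cuda.memory_allocated() / 2**30
    peak = torch.cuda.max_memory_allocated() / 2**30
    print(f"[{tag}] allocated: {alloc:.2f} GiB, peak: {peak:.2f} GiB")

# Inside the batch loop, e.g.:
#   log_cuda_mem("before forward")
#   outputs = model(samples)
#   log_cuda_mem("after forward")
#   loss.backward()
#   log_cuda_mem("after backward")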