Hello, I have been trying to train a model on a computer vision task and keep getting the usual out-of-memory error. GPU memory always fills up at the 6th epoch, no matter what batch_size I use or what else I try. What I have tried so far:
- deleting intermediates with del outputs, loss after the backward pass
- accumulating the running loss with loss.detach().item() instead of the loss tensor itself
- calling gc.collect() and torch.cuda.empty_cache() at the start of every epoch
None of this has worked; all three attempts are still visible in the training loop below.

My training loop definition:
```python
def train_model(model, optimizer, criterion=loss_func, metric=dice_metric, n_epochs=20, batch_size=BATCH_SIZE):
    model.to(DEVICE)
    # Defining optimizer, loss, and dataloaders
    train_set = HubMAPDataset(df=train_df, fold=fold, train=True)
    val_set = HubMAPDataset(df=train_df, fold=fold, train=False)
    train_loader = DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=NUM_WORKERS)
    val_loader = DataLoader(val_set, batch_size=batch_size)
    optimizer = optimizer([
        {'params': model.encoder.parameters(), 'lr': 8e-5},
        {'params': model.decoder.parameters(), 'lr': 5e-5}
    ])
    scheduler = OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
                           max_lr=1e-3, epochs=n_epochs, steps_per_epoch=len(train_loader))
    result = None
    best_score = 0
    val_scores, train_scores, val_losses, train_losses = [], [], [], []
    best_val_epoch = -1
    print("Starting Training")
    for epoch in range(n_epochs):
        ########################
        #       TRAINING       #
        ########################
        gc.collect()
        torch.cuda.empty_cache()
        model.train()
        epoch_loss, epoch_score = 0, 0
        t = tqdm(train_loader, leave=False)
        for images, labels in t:
            optimizer.zero_grad()
            images, labels = images.to(DEVICE), labels.to(DEVICE)
            outputs = model(images)
            loss = criterion(outputs, labels)
            epoch_score += metric(outputs, labels)
            epoch_loss += loss.detach().item()
            loss.backward()
            del loss, outputs
            optimizer.step()
            scheduler.step()
        # Statistics recording
        epoch_loss /= len(train_loader)
        epoch_score /= len(train_loader)
        train_losses.append(epoch_loss)
        train_scores.append(epoch_score)
        if epoch % 5 != 0:
            print(f"FOLD: {fold}, EPOCH: {epoch + 1}, train_loss: {epoch_loss}, training dice: {epoch_score}")
        ########################
        #      VALIDATION      #
        ########################
        if epoch % 4 == 0 and epoch != 0:
            model.eval()
            with torch.no_grad():
                valid_loss, val_score = 0, 0
                t_val = tqdm(val_loader)
                for val_images, val_labels in t_val:
                    val_images, val_labels = val_images.to(DEVICE), val_labels.to(DEVICE)
                    outputs = model(val_images)
                    val_score += metric(outputs, val_labels)
                    # val_loss += loss_fn(outputs, val_labels)
                val_score /= len(val_loader)
                val_scores.append(val_score)
                if val_score > best_score:
                    best_score = val_score
                    torch.save(model.state_dict(), f"{MODEL_NAME}_{ENCODER}-{IMG_SIZE}x{IMG_SIZE}_BestBaseline_{fold}_bsize-{BATCH_SIZE}.pth")
                    print(f"Saving model with best val score : {MODEL_NAME}_{ENCODER}-{IMG_SIZE}x{IMG_SIZE}_BestBaseline_{fold}_bsize-{BATCH_SIZE}.pth")
                print(f"FOLD: {fold}, EPOCH: {epoch + 1}")
                print(f"{'#'*30} Validation {'#'*100}")
                print(f"{'#'*30} Train_loss: {epoch_loss}, Train_dice: {epoch_score}, Val dice: {val_score} {'#'*25}")
        print(f"Memory cached in GPU: {torch.cuda.memory_cached()}")
```
I tracked the memory cached in the GPU at every epoch (I have 16 GB of GPU memory):
```
Memory cached in GPU: 5486149632
Memory cached in GPU: 7537164288
Memory cached in GPU: 9479127040
Memory cached in GPU: 11607736320
Memory cached in GPU: 13600030720
Memory cached in GPU: 15669919744
```
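In case it helps, this is roughly how I produce that log each epoch (a minimal sketch; as far as I know, torch.cuda.memory_cached() is a deprecated alias of torch.cuda.memory_reserved() on newer PyTorch versions, and memory_allocated() reports the bytes held by live tensors rather than by the caching allocator):

```python
import torch

def log_gpu_memory(epoch):
    # Bytes currently held by live tensors
    allocated = torch.cuda.memory_allocated()
    # Bytes reserved by PyTorch's caching allocator (what the numbers above show)
    reserved = torch.cuda.memory_reserved()
    print(f"epoch {epoch}: allocated={allocated / 1e9:.2f} GB, reserved={reserved / 1e9:.2f} GB")
```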
Then the error:
```
RuntimeError: CUDA out of memory. Tried to allocate 72.00 MiB (GPU 0; 15.90 GiB total capacity; 12.07 GiB already allocated; 35.75 MiB free; 15.10 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
```
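For completeness, my understanding is that the allocator hint in the last line of the error would be applied like this, before the process makes its first CUDA allocation (I have not confirmed it addresses the growth across epochs, and 128 is just an example value):

```python
import os

# Must be set before the first CUDA allocation in the process; 128 is an arbitrary example
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
```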
Any suggestion would be greatly appreciated.