I am facing the same problem: memory usage grows every epoch during training — a large, almost monotonic increase that starts right after the first epoch.
Could the ‘scaler’ be the cause? I already tried removing the ‘schedule’. Since this code works fine with other models, could the model itself be responsible for the large GPU RAM increase after the first epoch?
import torch
import os
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import torchvision.transforms.functional as tf
import numpy as np
import pandas as pd
from model import UResNet50
from utils import *
import time
# Defining Hyperparameters and run-configuration flags (read as module
# globals by train_fn() and main()).
learning_rate = 1e-3 # learning rate
device = 'cuda' if torch.cuda.is_available() else 'cpu'  # train on GPU when available
batch_size = 1 # batch size
num_epochs = 60 # number of epochs
num_workers = 2 # number of DataLoader worker processes
clip_train = 0.05 # percentage to clip the train dataset (for tests)
clip_valid = 0.05 # percentage to clip the valid dataset (for tests)
valid_percent = 0.2 # use a percent. of train dataset as validation dataset
start_save = 10 # epoch to start saving checkpoints
image_height = 224 # height to crop the image
image_width = 224 # width to crop the image
pin_memory = True  # pin host memory for faster host-to-GPU transfers
load_model = False # 'true' to load a model and test it, or use it
save_model = True # 'true' to save model trained after epoches
continue_training = False # 'true' to load and continue training a model
save_images = False # saving example from predicted and original
change_last_fc = False # to change the last fully connected layer
test_models = False # 'true' to test the models saved in 'save_results_dir'
last_epoch = 0 # when 'continue_training', write the n° of last epoch
# Directories with the training images, one folder per cell class.
train_image_dir = ['G:/Shared drives/Veterinary Microscope/Dataset/Raabin-WBC Data - Nucleus_cytoplasm_Ground truths/GrTh/Original/Basophil',
'G:/Shared drives/Veterinary Microscope/Dataset/Raabin-WBC Data - Nucleus_cytoplasm_Ground truths/GrTh/Original/Eosinophil',
'G:/Shared drives/Veterinary Microscope/Dataset/Raabin-WBC Data - Nucleus_cytoplasm_Ground truths/GrTh/Original/Lymphocyte',
'G:/Shared drives/Veterinary Microscope/Dataset/Raabin-WBC Data - Nucleus_cytoplasm_Ground truths/GrTh/Original/Monocyte',
'G:/Shared drives/Veterinary Microscope/Dataset/Raabin-WBC Data - Nucleus_cytoplasm_Ground truths/GrTh/Original/Neutrophil']
val_image_dir = None  # None: validation split is carved from the train set
# directory to save the results
save_results_dir = 'C:/Users/marlo/My Drive/College/Biophotonics Lab/Research/Programs/Python/Camera & Image/Microscópio Veterinário/Atuais/ResNet em U'
# directory to load models to test, normally equal to 'save_results_dir'
test_models_dir = 'C:/Users/marlo/My Drive/College/Biophotonics Lab/Research/Programs/Python/Camera & Image/Microscópio Veterinário/Atuais/ResNet em U'
def train_fn(loader, model, optimizer, loss_fn, scaler, schedule, epoch, last_lr):
    """Train `model` for one epoch and step the LR scheduler.

    Parameters
    ----------
    loader : iterable yielding dicts with exactly two keys (image, label
        tensors) -- the two-key shape is assumed by the unpacking below.
    model : the network being trained.
    optimizer : torch optimizer updated every batch.
    loss_fn : criterion applied to (prediction, target).
    scaler : torch.cuda.amp.GradScaler, or None when training on CPU.
    schedule : LR scheduler, stepped at most once per call (per epoch).
    epoch : int, zero-based epoch index (progress-bar label only).
    last_lr : last learning rate list from `schedule.get_last_lr()`;
        returned unchanged when the scheduler step is skipped.

    Returns
    -------
    (loss_item, last_lr) : float loss of the final batch (None if the
        loader was empty) and the current learning rate.
    """
    loop = tqdm(loader, desc='Epoch ' + str(epoch + 1))
    loss_item = None
    # Track the GradScaler scale so we can detect a skipped optimizer step
    # (the scale shrinks after an inf/nan gradient) and decide whether to
    # step the scheduler. Initialized here so an empty loader cannot leave
    # it undefined (was a latent NameError in the original).
    scale = scaler.get_scale() if scaler is not None else None
    for dictionary in loop:
        # The batch dict is assumed to hold exactly two keys: image, label.
        image_key, label_key = dictionary
        x = dictionary[image_key].to(device=device)
        y = dictionary[label_key].to(device=device).float()
        # Forward pass under autocast; torch.autocast(device_type) is the
        # current API covering both CUDA and CPU.
        with torch.autocast('cuda' if torch.cuda.is_available() else 'cpu'):
            pred = model(x)
            # Crop the target to the prediction's spatial size.
            y = tf.center_crop(y, pred.shape[2:])
            loss = loss_fn(pred, y)
        # set_to_none=True frees the gradient buffers between steps instead
        # of zero-filling them -- slightly lower steady-state memory.
        optimizer.zero_grad(set_to_none=True)
        if scaler is not None:
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scale = scaler.get_scale()
            scaler.update()
        else:
            # On CPU, torch.cuda.amp.GradScaler is unavailable.
            loss.backward()
            optimizer.step()
        # Keep only the Python float; dropping the tensor references lets
        # the CUDA caching allocator reuse their memory next iteration.
        loss_item = loss.item()
        del loss, pred, y, x
        # Updating the tqdm progress bar with the running loss.
        loop.set_postfix(loss=loss_item)
    # Scheduling the learning rate once per epoch. With a scaler, only step
    # when the scale did not grow on the last batch (original behavior).
    if scaler is not None:
        if scale >= scaler.get_scale():
            schedule.step()
            last_lr = schedule.get_last_lr()
    else:
        schedule.step()
        last_lr = schedule.get_last_lr()
    return loss_item, last_lr
def main():
    """Build the model, optionally resume from a checkpoint, train, and
    persist metrics/plots.

    Relies on the module-level configuration constants above and on helpers
    imported via `from utils import *` (get_loaders, load_checkpoint,
    check_accuracy, save_checkpoint, save_predictions_as_imgs).
    NOTE(review): indentation of this function was reconstructed from the
    pasted (flattened) code; the `elif` after the `change_last_fc` block
    fixes most of the nesting unambiguously, but verify against the
    original file.
    """
    model = UResNet50(in_channels=3, num_classes=3).to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
    schedule = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    train_loader, valid_loader = get_loaders(
        train_image_dir=train_image_dir,
        valid_percent=valid_percent,
        batch_size=batch_size,
        image_height=image_height,
        image_width=image_width,
        num_workers=num_workers,
        pin_memory=pin_memory,
        val_image_dir=val_image_dir,
        clip_valid=clip_valid,
        clip_train=clip_train
    )
    if load_model:
        # Loading checkpoint; if on 'cpu', we need to pass 'map_location'.
        # NOTE(review): 'root_folder' is not defined in this file -- it is
        # presumably re-exported by 'from utils import *'; confirm.
        os.chdir(root_folder)
        if device == 'cuda':
            load_checkpoint(torch.load('my_checkpoint.pth.tar'), model)
        else:
            load_checkpoint(torch.load('my_checkpoint.pth.tar',
                                       map_location=torch.device('cpu')),
                            model)
        check_accuracy(valid_loader, model, loss_fn, device=device)
    if not load_model or continue_training:
        # Changing folder so checkpoints and 'dictionary.csv' land together.
        os.chdir(save_results_dir)
        if continue_training:
            # Load the model+optimizer state and the saved metric history.
            print('- Continue Training...\n')
            start = time.time()
            if device == 'cuda':
                load_checkpoint(torch.load('my_checkpoint.pth.tar'), model,
                                optimizer=optimizer)
            else:
                load_checkpoint(torch.load('my_checkpoint.pth.tar',
                                           map_location=torch.device('cpu')),
                                model, optimizer=optimizer)
            # Reading 'dictionary.csv' back into the metrics dict.
            df = pd.read_csv('dictionary.csv')
            temp = df.to_dict('split')['data']
            dictionary = {'acc': [], 'loss': [], 'dice score': [],
                          'time taken': []}
            for acc, loss, dice_score, time_item in temp:
                dictionary['acc'].append(acc)
                dictionary['loss'].append(loss)
                dictionary['dice score'].append(dice_score)
                dictionary['time taken'].append(time_item)
            # Optionally replace the last fully-connected layer.
            if change_last_fc:
                print('yess changes')
                model.fc = nn.Linear(21, 5)
                model.cuda()
        else:
            # Fresh run: record the untrained baseline metrics first.
            print('- Start Training...\n')
            start = time.time()
            dictionary = {'acc': [], 'loss': [], 'dice score': [],
                          'time taken': []}
            acc_item, loss_item, dice_score = check_accuracy(
                valid_loader, model, loss_fn, device=device)
            dictionary['acc'].append(acc_item)
            dictionary['loss'].append(loss_item)
            dictionary['dice score'].append(dice_score)
            dictionary['time taken'].append((time.time() - start) / 60)
        if device == 'cuda':
            scaler = torch.cuda.amp.GradScaler()
        else:
            # With 'cpu' we can't use cuda.amp.GradScaler(); train without.
            scaler = None
        # 'last_lr' must exist before the first call to 'train_fn'.
        last_lr = schedule.get_last_lr()
        for epoch in range(num_epochs):
            loss_item, last_lr = train_fn(train_loader, model, optimizer,
                                          loss_fn, scaler, schedule, epoch,
                                          last_lr)
            dictionary['loss'].append(loss_item)
            # Save a checkpoint once past the warm-up epochs.
            if save_model and epoch >= start_save:
                checkpoint = {
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }
                save_checkpoint(checkpoint,
                                filename='my_checkpoint'
                                + str(epoch + 1 + last_epoch) + '.pth.tar')
            # Validate and record metrics for this epoch.
            acc_item, temp, dice_score = check_accuracy(
                valid_loader, model, loss_fn, device=device)
            stop = time.time()
            dictionary['acc'].append(acc_item)
            dictionary['dice score'].append(dice_score)
            dictionary['time taken'].append((stop - start) / 60)
            # Print some prediction examples to the output folder.
            if save_images:
                save_predictions_as_imgs(
                    valid_loader, model,
                    folder=os.path.join(root_folder, 'saved_images'),
                    device=device
                )
            # Persist the metric history every epoch so a crash loses little.
            if save_model:
                df = pd.DataFrame(dictionary,
                                  columns=['acc', 'loss', 'dice score',
                                           'time taken'])
                df.to_csv('dictionary.csv', index=False)
            print('- Time taken:', round((stop - start) / 60, 3), 'min')
            print('- Last Learning rate:', round(last_lr[0], 8), '\n')
        # Plot the training curves once training is finished.
        plt.subplots()
        plt.plot(np.asarray(dictionary['acc']) / 100, label='accuracy')
        plt.plot(np.asarray(dictionary['loss']) / 100, label='loss')
        plt.plot(np.asarray(dictionary['dice score']) / 100,
                 label='dice score')
        plt.legend()
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy, Loss, and Dice score')
        plt.show()
# Script entry point: guard so importing this module does not start training.
if __name__ == '__main__':
    main()