GPU memory consumption increases while training

@apaszke @smth Would you please give some advice on this problem? You seem to have a good knowledge of PyTorch. Thanks very much!

If you add del loss, output at the end of the loop, the memory usage will likely remain the same after the first iteration (what you see is probably a side effect of Python’s scoping rules). It’s possible that cuDNN uses much less memory than the default backend.
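
A minimal sketch of what that looks like in a loop (loader, model, criterion and optimizer are generic placeholders, not from the original post):

for inputs, labels in loader:
    optimizer.zero_grad()
    output = model(inputs)
    loss = criterion(output, labels)
    loss.backward()
    optimizer.step()

    last_loss = loss.item()  # keep only a Python float for logging
    # without the following, loss and output (and the graph they reference)
    # stay alive until the next iteration rebinds the names
    del loss, output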


Wow, it really works. Thank you very much. There is so much for me to learn about Python and PyTorch. :smiley:

When running a Tiramisu model, I found that GPU usage was 4.5 GB during the 1st epoch and shot up during the 2nd. Based on your comment I did the following:

out = model(input)
loss = crite(out, labels)

loss.backward()
del loss
del out

And thankfully my network used the same amount of GPU memory in the following epochs.

Oh, it really helps. Thanks!

Hi @apaszke, I tried your solution but it doesn’t solve my problem. See here about my situation. Really appreciate your help, thanks!

Answered in the thread.

Hello,

I have the same problem: my GPU memory usage increases in the nvidia-smi output, while torch.cuda.memory_allocated('cuda:0') always reports the same value.

I already use del loss and detach_() some tensors after the backward() call.

Still, I never get an OOM error, as my training always runs to the end.

Thank you

Could you post a code snippet to reproduce this issue, please?
Based on the description it seems as if memory is really leaked (not just increased due to e.g. storing the computation graph).

if use_gpu:
    if torch.cuda.is_available():
        torch.backends.cudnn.enabled = True
        torch.backends.cudnn.benchmark = True
        retinanet = retinanet.cuda()

if torch.cuda.is_available():
    retinanet = torch.nn.DataParallel(retinanet).cuda()
else:
    retinanet = torch.nn.DataParallel(retinanet)

for epoch_num in range(parser.epochs):

    train_loss = train(dataloader_train, retinanet, optimizer, writer, epoch_num, train_hist)

    val_loss = eval(dataloader_val, retinanet, writer, epoch_num, val_hist)

    AP_eval = csv_eval.evaluation(dataset_val, retinanet)

def train(dataloader_train, model, optimizer, writer, epoch, train_hist):

    print("Train")

    model.train()

    model.module.set_compute(True)

    epoch_loss = []

    for iter_num, data in enumerate(dataloader_train):
        try:
            optimizer.zero_grad()

            if torch.cuda.is_available():
                classification_loss, regression_loss = model([data['img'].cuda().float(), data['annot']])
            else:
                classification_loss, regression_loss = model([data['img'].float(), data['annot']])

            classification_loss = classification_loss.mean()
            regression_loss = regression_loss.mean()
        
            loss = classification_loss + regression_loss

            if bool(loss == 0):
                continue

            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)

            optimizer.step()

            classification_loss.detach_()
            regression_loss.detach_()
            loss.detach_()

            if parser.debug != 1:
                writer.add_scalar("ClassLoss/Train", classification_loss.cpu(), (iter_num + len(dataloader_train) * epoch))
                writer.add_scalar("RegLoss/Train", regression_loss.cpu(), (iter_num + len(dataloader_train) * epoch))
                writer.add_scalar("Total/Train", loss.cpu(), (iter_num + len(dataloader_train) * epoch))

            train_hist.append(float(loss.cpu()))

            epoch_loss.append(float(loss.cpu()))

            if iter_num % int(len(dataloader_train)/10) == 0:

                print(
                'Epoch: {:3d}/{:3d} | Iteration: {:4d}/{:4d} | Classification loss: {:1.5f} | Regression loss: {:1.5f} | Epoch loss: {:1.5f} | Average loss: {:1.5f}'.format(
                    epoch, parser.epochs,  iter_num, len(dataloader_train), float(classification_loss.cpu()), float(regression_loss.cpu()), np.mean(epoch_loss), np.mean(train_hist)))

            del classification_loss
            del regression_loss
            del loss
        except Exception as e:
            print(e)
            continue

    if parser.debug != 1:
        writer.add_scalar("EpochLoss/Train", np.mean(epoch_loss), epoch)

    return np.mean(epoch_loss)

def eval(dataloader_val, model, writer, epoch, val_hist):

    print("Eval")

    model.eval()
    model.module.freeze_bn()
    model.module.set_compute(True)

    epoch_loss = []

    with torch.no_grad():
        for iter_num, data in enumerate(dataloader_val):
            try:
                if torch.cuda.is_available():
                    classification_loss, regression_loss = model([data['img'].cuda().float(), data['annot']])
                else:
                    classification_loss, regression_loss = model([data['img'].float(), data['annot']])

                classification_loss = classification_loss.mean().cpu()
                regression_loss = regression_loss.mean().cpu()

                loss = classification_loss + regression_loss

                if bool(loss == 0):
                    continue

                if parser.debug != 1:
                    writer.add_scalar("ClassLoss/Eval", classification_loss, (iter_num + len(dataloader_val) * epoch))
                    writer.add_scalar("RegLoss/Eval", regression_loss, (iter_num + len(dataloader_val) * epoch))
                    writer.add_scalar("Total/Eval", loss, (iter_num + len(dataloader_val) * epoch))

                val_hist.append(float(loss))
                epoch_loss.append(float(loss))

                if iter_num % int(len(dataloader_val)/10) == 0:

                    print('Epoch: {:3d}/{:3d} | Iteration: {:4d}/{:4d} | Classification loss: {:1.5f} | Regression loss: {:1.5f} | Epoch loss: {:1.5f} | Average loss: {:1.5f}'.format(epoch, parser.epochs,  iter_num, len(dataloader_val), float(classification_loss), float(regression_loss), np.mean(epoch_loss), np.mean(val_hist)))
                
                del classification_loss
                del regression_loss
                del loss

            except Exception as e:
                print(e)
                continue

    if parser.debug != 1:
        writer.add_scalar("EpochLoss/Eval", np.mean(epoch_loss), epoch)

    return np.mean(epoch_loss)

Here is my code snippet.

csv_eval.evaluation(dataset_val, retinanet) is just inference to compute the mAP.

What I have observed is that during the first training epoch the nvidia-smi memory usage increases a bit, during the eval epoch it drops to a low value, and during inference to a very low one (normal behaviour). During the second training epoch the memory still increases, but what is surprising is that during eval and inference the memory usage stays at the same level as during training.

After 2-3 epochs it stays around the same value, regardless of the train/eval/inference phase.

In that case I might have misunderstood the issue and thought the memory would increase in every epoch.
If you are seeing some memory increase in the first epochs, this might be due to memory fragmentation. E.g. once you finish the first training and validation run, PyTorch might free some intermediate tensors which are not referenced anymore. The next memory allocation might not fit into the freed blocks, so new memory has to be allocated.

How large is the memory increase after the first, second and third epoch?
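
A small logging helper (a sketch; the device index and call sites are illustrative, and older PyTorch versions expose memory_cached() instead of memory_reserved()) can be dropped in at the phase boundaries to quantify the increase and to tell a real leak (allocated keeps growing) from caching or fragmentation (reserved grows while allocated stays flat):

import torch

def log_gpu_memory(tag, device=0):
    # memory_allocated: memory currently held by live tensors
    # memory_reserved: memory held by PyTorch's caching allocator, roughly what
    # nvidia-smi shows (plus the CUDA context and other processes)
    alloc = torch.cuda.memory_allocated(device) / 1024**2
    reserved = torch.cuda.memory_reserved(device) / 1024**2
    print(f'{tag}: allocated={alloc:.0f} MiB, reserved={reserved:.0f} MiB')

# e.g. log_gpu_memory('epoch 1 / train'), log_gpu_memory('epoch 1 / eval'), ...
# torch.cuda.memory_summary() prints a detailed per-block breakdown if
# fragmentation is suspected.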

I just made a test now (nvidia-smi usage at the first and the last iteration of each phase):

Epoch 1: train 6139M → 7209M | eval 3291M → 3291M | inference 3291M → 3291M
Epoch 2: train 6409M → 7477M | eval 7479M → 7479M | inference 7479M → 7479M
Epoch 3: train 7479M → 7479M | eval 7479M → 7479M | inference 7479M → 7479M

Maybe this is normal, in which case I learned something new. My guess is that there is some “knowledge” of the train > eval > inference pattern, and the GPU keeps the computational graph in memory for the next epochs even when we are inside a torch.no_grad() context?

Note that PyTorch tries to reuse the cached memory in order to avoid cudaMalloc calls.
If you are measuring the memory usage via nvidia-smi only, you’ll see the overall used memory (allocated + cached + CUDA context + other processes).

You could check the allocated memory via torch.cuda.memory_allocated(), which would most likely go down during evaluation, if you’ve properly freed the training data.
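
For reference, a minimal check along those lines (just a sketch) right after switching to evaluation:

import torch

# allocated should drop once the training tensors are freed, even though
# nvidia-smi keeps showing the reserved (cached) memory
print(f'allocated: {torch.cuda.memory_allocated() / 1024**2:.0f} MiB')
print(f'reserved:  {torch.cuda.memory_reserved() / 1024**2:.0f} MiB')

# optional: return the unused cached blocks to the driver so nvidia-smi drops too;
# this is usually unnecessary and slows the next allocations down, since PyTorch
# has to call cudaMalloc again
torch.cuda.empty_cache()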


The behaviour follows what you said. Thank you for the explanation!

I faced a similar issue. Mine turned out to be that I was collecting validation stats across batches, but this was all happening on the GPU. I solved the problem by detaching the tensors and sending the collected validation data to the CPU.
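
A minimal sketch of that fix, with placeholder names (val_loader, model, criterion are illustrative, not the actual code):

import torch

val_losses = []
for inputs, targets in val_loader:
    outputs = model(inputs.cuda())
    loss = criterion(outputs, targets.cuda())
    # .detach() drops the autograd graph and .cpu() moves the value off the GPU,
    # so the growing list no longer keeps CUDA memory (or graphs) alive
    val_losses.append(loss.detach().cpu())
mean_val_loss = torch.stack(val_losses).mean()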

I’m sorry for the delayed reply on this thread. I arrived at this issue a couple of moments ago and your answer solved it for me, but I’m not entirely sure why. Can you point me to a resource where I can read more about this (particularly about the cause)?

GPU memory consumption increases a lot at the first several iterations while training.

I am also facing this issue. I tried the solution

But this did not have a noticeable impact. My training loop is as follows:

def run(self):
    pb = tqdm(
        total=self._n,
        leave=self._keep,
        colour=self._bar_color,
        desc=self._desc
    )
    train_accuracies = 0.0
    train_ious = 0.0
    train_mses = 0.0
    for data_dict in self._generator:
        if self._eval_mode:
            self._model.eval()
        else:
            self._model.train()
        self._optimizer.zero_grad()
        pred = self._model(
            x_a=data_dict['x1'],
            x_b=data_dict['x2'],
        )
        im_acc = label_accuracy(
            prediction=pred['y1'],
            target=data_dict['y1']
        )
        im_iou = iou_score(
            predictions=pred['y1'],
            targets=data_dict['y1']
        )
        pc_mse = pc_mse_metric(
            prediction=pred['x2'],
            target=data_dict['x2']
        )
        train_accuracies += im_acc
        train_ious += im_iou
        train_mses += pc_mse
        if not self._eval_mode:
            loss = self._criterion(
                predictions=pred, targets={
                    'y1': data_dict['y1'],
                    'x2': data_dict['x2']
                }
            )

            loss.backward()
            self._optimizer.step()
        pb.update(n=1)
    k = "Validation" if self._eval_mode else "Training"
    self._metrics.append(
        {
            k: {
                'Accuracy': (train_accuracies / self._n),
                'mIoU': (train_ious / self._n),
                'MSE': (train_mses / self._n)
            }
        }
    )

Due to this problem I am encountering RuntimeError: CUDA out of memory after a few iterations.
What am I doing wrong?

In your code you are accumulating stats in:

        train_accuracies += im_acc
        train_ious += im_iou
        train_mses += pc_mse

which could increase the memory usage if some of these tensors are still attached to the computation graph, since the entire graph would then also be stored in each iteration.
Assuming you want to track these statistics without calling backward() on any of these tensors, make sure to .detach() the tensors before adding them, or call .item() in case it’s a scalar value.
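
Applied to the accumulation above, the fix could look like this (a sketch, assuming the three metrics return 0-dim tensors):

# .item() converts a 0-dim tensor to a plain Python number, which detaches it
# from the computation graph; use .detach() instead if the metric is not a scalar
train_accuracies += im_acc.item()
train_ious += im_iou.item()
train_mses += pc_mse.item()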


I am facing the same problem, with memory increasing every epoch during training (a large, almost monotonic increase, even after the first epoch).

Could the ‘scaler’ be the cause? I tried removing the ‘schedule’. This code works with other models; could the model be responsible for the large GPU RAM increase after the first epoch?

import torch
import os
from tqdm import tqdm
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt
import torchvision.transforms.functional as tf
import numpy as np
import pandas as pd
from model import UResNet50
from utils import *
import time

# Defining Hyperparameters
learning_rate = 1e-3    # learning rate
device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size = 1          # batch size
num_epochs = 60         # number of epochs
num_workers = 2         # number of workers
clip_train = 0.05          # percentage to clip the train dataset (for tests)
clip_valid = 0.05          # percentage to clip the valid dataset (for tests)
valid_percent = 0.2     # use a percent. of train dataset as validation dataset
start_save = 10         # epoch to start saving
image_height = 224      # height to crop the image
image_width = 224       # width to crop the image
pin_memory = True
load_model = False      # 'true' to load a model and test it, or use it
save_model = True       # 'true' to save model trained after epoches
continue_training = False # 'true' to load and continue training a model
save_images = False     # saving example from predicted and original
change_last_fc = False  # to change the last fully connected layer
test_models = False     # 'true' to test the models saved in 'save_results_dir'
last_epoch = 0          # when 'continue_training', write the n° of last epoch


train_image_dir = ['G:/Shared drives/Veterinary Microscope/Dataset/Raabin-WBC Data - Nucleus_cytoplasm_Ground truths/GrTh/Original/Basophil',
                   'G:/Shared drives/Veterinary Microscope/Dataset/Raabin-WBC Data - Nucleus_cytoplasm_Ground truths/GrTh/Original/Eosinophil',
                   'G:/Shared drives/Veterinary Microscope/Dataset/Raabin-WBC Data - Nucleus_cytoplasm_Ground truths/GrTh/Original/Lymphocyte',
                   'G:/Shared drives/Veterinary Microscope/Dataset/Raabin-WBC Data - Nucleus_cytoplasm_Ground truths/GrTh/Original/Monocyte',
                   'G:/Shared drives/Veterinary Microscope/Dataset/Raabin-WBC Data - Nucleus_cytoplasm_Ground truths/GrTh/Original/Neutrophil']

val_image_dir = None
# directory to save the results
save_results_dir = 'C:/Users/marlo/My Drive/College/Biophotonics Lab/Research/Programs/Python/Camera & Image/Microscópio Veterinário/Atuais/ResNet em U'
# directory to load models to test, normally equal to 'save_results_dir'
test_models_dir = 'C:/Users/marlo/My Drive/College/Biophotonics Lab/Research/Programs/Python/Camera & Image/Microscópio Veterinário/Atuais/ResNet em U'

def train_fn(loader, model, optimizer, loss_fn, scaler, schedule, epoch, last_lr):
    loop = tqdm(loader, desc='Epoch '+str(epoch+1))
    
    for batch_idx, (dictionary) in enumerate(loop):
        image, label = dictionary
        x, y = dictionary[image], dictionary[label]
        x, y = x.to(device=device), y.to(device=device)
        y = y.float()

        # Forward
        with torch.cuda.amp.autocast() if torch.cuda.is_available() else torch.autocast('cpu'):
            pred = model(x)
            y = tf.center_crop(y, pred.shape[2:])
            loss = loss_fn(pred, y)
        
        # Backward
        optimizer.zero_grad()
        if device == 'cuda':
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scale = scaler.get_scale()
            scaler.update()
        else:
            # if device='cpu', we cannot use 'scaler=torch.cuda.amp.GradScaler()'
            loss.backward()
            optimizer.step()
        # deleting loss, prediction, x, and y
        loss_item = loss.item()
        del loss, pred, y, x, image, label, dictionary
        # updating tqdm loop
        loop.set_postfix(loss=loss_item)
    # deleting loader and loop
    del loader, loop
    #scheduling learning rate
    if scaler:
        if scale >= scaler.get_scale():
            schedule.step()
            last_lr = schedule.get_last_lr()
    else:
        schedule.step()
        last_lr = schedule.get_last_lr()
    
    return loss_item, last_lr


def main():
    
    model = UResNet50(in_channels=3, num_classes=3).to(device)
    loss_fn = nn.CrossEntropyLoss()
    optimizer = optim.SGD(model.parameters(), lr=learning_rate, momentum=0.9)
    schedule = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)
    
    train_loader, valid_loader = get_loaders(
        train_image_dir=train_image_dir,
        valid_percent=valid_percent,
        batch_size=batch_size,
        image_height=image_height,
        image_width=image_width,
        num_workers=num_workers,
        pin_memory=pin_memory,
        val_image_dir=val_image_dir,
        clip_valid=clip_valid,
        clip_train=clip_train
    )
    
    if load_model:
        # Loading checkpoint, if 'cpu', we need to pass 'map_location'
        os.chdir(root_folder)
        if device == 'cuda':
            load_checkpoint(torch.load('my_checkpoint.pth.tar'), model)
        else:
            load_checkpoint(torch.load('my_checkpoint.pth.tar',
                                       map_location=torch.device('cpu')), model)
        check_accuracy(valid_loader, model, loss_fn, device=device)
    
    if not load_model or continue_training:
        # Changing folder to save dictionary
        os.chdir(save_results_dir)
        # If 'continue_training = True', we load the model and continue training
        if continue_training:
            print('- Continue Training...\n')
            start = time.time()
            if device == 'cuda':
                load_checkpoint(torch.load('my_checkpoint.pth.tar'), model,
                                optimizer=optimizer)
            else:
                load_checkpoint(torch.load('my_checkpoint.pth.tar',
                                           map_location=torch.device('cpu')),
                                           model, optimizer=optimizer)
            # reading the csv 'dictionary.csv' as a dictionary
            df = pd.read_csv('dictionary.csv')
            temp = df.to_dict('split')
            temp = temp['data']
            dictionary = {'acc':[], 'loss':[], 'dice score':[], 'time taken':[]}
            for acc, loss, dice_score, time_item in temp:
                dictionary['acc'].append(acc)
                dictionary['loss'].append(loss)
                dictionary['dice score'].append(dice_score)
                dictionary['time taken'].append(time_item)
            # if change the last fully-connected layer:
            if change_last_fc == True:
                print('yess changes')
                model.fc = nn.Linear(21, 5)
                model.cuda()
        elif not continue_training:
            print('- Start Training...\n')
            start = time.time()
            # Opening a 'loss' and 'acc' list, to save the data
            dictionary = {'acc':[], 'loss':[], 'dice score':[], 'time taken':[]}
            acc_item, loss_item, dice_score = check_accuracy(valid_loader, model, loss_fn, device=device)
            dictionary['acc'].append(acc_item)
            dictionary['loss'].append(loss_item)
            dictionary['dice score'].append(dice_score)
            dictionary['time taken'].append((time.time()-start)/60)
        
        if device == 'cuda':
            scaler = torch.cuda.amp.GradScaler()
        else:
            # with 'cpu' we can't use cuda.amp.GradScaler(), we only use autograd
            scaler = None
        
        # to use 'last_lr' in 'train_fn', we have to define it first
        last_lr = schedule.get_last_lr()
        
        # running epochs for
        for epoch in range(num_epochs):
            loss_item, last_lr = train_fn(train_loader, model, optimizer,
                                          loss_fn, scaler, schedule, epoch,
                                          last_lr)
            
            dictionary['loss'].append(loss_item)
            # save model
            if save_model and epoch >= start_save:
                checkpoint = {
                    'state_dict': model.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }
                save_checkpoint(checkpoint, filename='my_checkpoint'+str(epoch+1+last_epoch)+'.pth.tar')
            # check accuracy
            acc_item, temp, dice_score = check_accuracy(valid_loader, model, loss_fn, device=device)
            stop = time.time()
            dictionary['acc'].append(acc_item)
            dictionary['dice score'].append(dice_score)
            dictionary['time taken'].append((stop-start)/60)
            # print some examples to folder
            if save_images:
                save_predictions_as_imgs(
                    valid_loader, model, folder=os.path.join(root_folder,'saved_images'),
                    device=device
                )
            # saving dictionary to csv file
            if save_model:
                df = pd.DataFrame(dictionary, columns = ['acc', 'loss', 'dice score', 'time taken'])
                df.to_csv('dictionary.csv', index = False)
                        
            print('- Time taken:',round((stop-start)/60,3),'min')
            print('- Last Learning rate:', round(last_lr[0],8),'\n')
    
        plt.subplots()
        plt.plot(np.asarray(dictionary['acc'])/100, label ='accuracy')
        plt.plot(np.asarray(dictionary['loss'])/100, label = 'loss')
        plt.plot(np.asarray(dictionary['dice score'])/100, label = 'dice score')
        plt.legend()
        plt.xlabel('Epochs')
        plt.ylabel('Accuracy, Loss, and Dice score')
        plt.show()


if __name__ == '__main__':
    main()

    

No, the GradScaler will not keep unused references around and thus will not increase the memory usage.
I would recommend checking all returned tensors, e.g. from check_accuracy, and making sure none of them has a valid .grad_fn, since you are storing these tensors.
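
A quick way to verify that (check_accuracy and its return values are taken from the code above; the check itself is just a sketch):

import torch

acc_item, loss_item, dice_score = check_accuracy(valid_loader, model, loss_fn, device=device)
for name, value in [('acc', acc_item), ('loss', loss_item), ('dice score', dice_score)]:
    if torch.is_tensor(value) and value.grad_fn is not None:
        # still attached to the computation graph: storing it keeps the whole
        # graph alive, so store value.detach() or value.item() instead
        print(f'{name} still has a grad_fn: {value.grad_fn}')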