HELP!: CPU leakage on training vgg16, pytorch uses all CPU cores when training on GPU

Hi. Can anybody help me? PyTorch is using all CPU cores. How do I stop this? I have num_workers=1.

My PyTorch code is occupying a lot of CPU memory even though I am training on the GPU.

def train_eval(fold, dataloaders, dataset_sizes, net, criterion, optimizer, scheduler, net_name, num_epochs):
    # NOTE(review): this listing is a damaged paste -- several statements have
    # lost part of their text (empty right-hand sides, missing `try:`/`else:`
    # headers, fused statements), so it does not parse as posted. Comments
    # marked "presumably" are guesses to confirm against the author's file.
    #
    # Purpose, as far as the visible code shows: for every epoch run a 'train'
    # and a 'val' phase over dataloaders[phase], accumulate the loss and the
    # number of correct predictions, and report per-phase loss and accuracy
    # normalised by dataset_sizes[phase]. After each 'val' phase a "last"
    # checkpoint is written, and a "best" checkpoint plus a metrics JSON file
    # whenever validation accuracy improves.
    Train and evaluate a net.
    # Initialize logs
    fname = os.path.join(args.model_dir, f'train{fold}.log')
    logging_train = myutils.setup_logger(fname)
    fname = os.path.join(args.model_dir, f'lr{fold}.log')
    logging_lr = myutils.setup_logger(fname)
    # Reproducibility
    # NOTE(review): no seeding is visible under this heading; only the device
    # is selected here.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # Load initial weights
    # NOTE(review): right-hand side lost in the paste -- presumably moves `net`
    # onto `device`; confirm.
    net =
    # Full copy of the starting weights; used as the initial "best" weights.
    best_net_wts = copy.deepcopy(net.state_dict())
    # Epochs are 1-based; `epoch` is overwritten below when resuming.
    best_acc, epoch = 0.0, 1

    # Initialize .tar files to save settings
    fname = f'last{fold}.tar'
    last_path = os.path.join(args.model_dir, fname)
    fname = f'best{fold}.tar'
    best_path = os.path.join(args.model_dir, fname)

    # To resume training for more epochs
    if args.resume:
            # NOTE(review): the `try:` that pairs with the `except` below is
            # missing from the paste.
            # Load last settings from .tar file
            last_checkpoint = torch.load(last_path)
            epoch = last_checkpoint['epoch'] + 1  # Since last epoch was saved we start with the next one
            # NOTE(review): the next line is the tail of a call whose start was
            # lost -- presumably a logging call; confirm.
  'Model: {args.model_dir}\tLast epoch saved: {epoch-1}, resumming training since epoch: {epoch}')

            # Load best settings from .tar file
            best_checkpoint = torch.load(best_path)
            best_net_wts = best_checkpoint['net_state_dict']
            best_acc = best_checkpoint['acc']

        except FileNotFoundError as err:
            # This error happens when folds are present
            # If interrupted on fold 1 then best best_checkpoint for fold 2 does
            # not exists. This is fixed like this.
            # NOTE(review): tail of a lost call again -- presumably logs `err`.
  'Model: {args.model_dir}\tError: {err}')

    # Continue from `epoch` (1, or the resumed value) through num_epochs inclusive.
    for epoch in range(epoch, num_epochs+1):

        # NOTE(review): two statements are fused on the next line; the second
        # has lost its beginning.
        print(f'Epoch {epoch}/{num_epochs}')'Epoch {epoch}/{num_epochs}')

        # Each epoch has a training phase and a validation phase
        for phase in ['train','val']:
            if phase == 'train':
                net.train()  # Set net to training mode
                # Learning rate of the first parameter group only.
                mylr_value = optimizer.param_groups[0]['lr']
                # NOTE(review): tail of a lost call, and the `else:` that should
                # guard net.eval() is missing from the paste.
      'Epoch {epoch}\tlr: {mylr_value}')
                net.eval()   # Set net to evaluate mode

            # Track statistics
            running_loss = 0.0
            running_corrects = 0

            # Iterate over data
            for inputs, labels in dataloaders[phase]:
                # NOTE(review): right-hand sides lost -- presumably the batch is
                # moved to `device`; confirm.
                inputs =
                labels =

                # Zero the parameter gradients
                # NOTE(review): the statement for this step is missing from the paste.

                # Forward
                # Track history if only in train
                with torch.set_grad_enabled(phase == 'train'):
                    outputs = net(inputs)
                    # Index of the maximum along dim 1 is used as the target;
                    # presumably `labels` is one-hot / per-class -- confirm.
                    _, targets = torch.max(labels, 1)
                    # Predicted class = index of the largest output along dim 1.
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, targets)

                    # Backward + optimize only if in training phase
                    # NOTE(review): the body of this `if` is missing from the paste.
                    if phase == 'train':

                # Batch statistics
                # Weighted by batch size because the sum is divided by
                # dataset_sizes[phase] below (assumes criterion returns a
                # per-batch mean -- confirm).
                running_loss += loss.detach().item() * inputs.size(0)  # This is batch loss
                # NOTE(review): the right operand and closing parenthesis were lost.
                running_corrects += torch.sum(preds ==  # This is batch accuracy

            # NOTE(review): the bodies of the architecture-specific branches
            # below are missing; presumably they step `scheduler` -- confirm.
            # efficientnetb
            if net_name.startswith('efficientnetb'):
                if phase == 'train':

            # inceptionv
            if net_name.startswith('inceptionv'):
                if phase == 'train':
                    if (epoch % 2) == 0:

            # Epoch statistics
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects.double() / dataset_sizes[phase]
            print('{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))
            # NOTE(review): tail of a lost call repeating the message above.
  '{} Loss: {:.4f} Acc: {:.4f}'.format(phase, epoch_loss, epoch_acc))

            if phase == 'val':

                # Save last settings to .tar file
                # NOTE(review): the opening of this call (something taking a
                # dict and `last_path`) was lost in the paste.
                'epoch': epoch,
                'net_state_dict': net.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': epoch_loss
                }, last_path)

                if epoch_acc > best_acc:
                    best_acc = epoch_acc
                    # NOTE(review): unlike the initial snapshot, no deepcopy is
                    # taken here, so this may alias the live parameters --
                    # confirm whether a copy is intended.
                    best_net_wts = net.state_dict()

                    # Save best settings to .tar file
                    # NOTE(review): opening of the call lost, as above.
                    'epoch': epoch,
                    'net_state_dict': best_net_wts,
                    'optimizer_state_dict': optimizer.state_dict(),
                    'loss': epoch_loss,
                    'acc': best_acc
                    }, best_path)

                    # Save best settings to .json file
                    # NOTE(review): the dict's closing brace and the body of the
                    # `with` block are missing from the paste.
                    best_metrics = {
                    f'loss{fold}': epoch_loss,
                    f'acc{fold}': best_acc.item()
                    fname = os.path.join(args.model_dir, f'metrics{fold}.json')
                    with open (fname, 'w') as f:

                    # NOTE(review): bodies of the two branches below are missing.
                    # vgg
                    if net_name.startswith('vgg'):

                    # resnet
                    if net_name.startswith('resnet'):

    # NOTE(review): two statements are fused on the next line; the second has
    # lost its beginning.
    print('Best val Acc: {:4f}'.format(best_acc))'Model: {}\tFold: {}\tBest val Acc: {:4f}'.format(args.model_dir, fold, best_acc))

# Script entry point: validates the directories given on the command line,
# loads params.json from the model directory, then for every fold loads the
# train/val CSV files, builds loaders, network, optimizer and loss, and calls
# train_eval.
# NOTE(review): this listing is a damaged paste (fused statements, lost call
# arguments, missing `if` headers); it does not parse as posted.
if __name__ == '__main__':
    args = parser.parse_args()

    assert os.path.isdir(args.data_dir), "Could not find the dataset at {}".format(args.data_dir)
    assert os.path.isdir(args.model_dir), "Could not find the model at {}".format(args.model_dir)
    assert os.path.isdir(args.net_dir), "Could not find the network at {}".format(args.net_dir)

    # Initialize main log folder
    logs_dir_path = os.path.join(os.getcwd(),'Logs')
    # NOTE(review): the body of this `if` is missing -- presumably it creates
    # the folder; confirm.
    if not os.path.exists(logs_dir_path):

    # Initialize main log file
    log_file = os.path.join(logs_dir_path, 'process.log')
    logging_process = myutils.setup_logger(log_file, date=True)

    # Save commandline settings to log
    script_activated = ' '.join(sys.argv)
    # NOTE(review): two statements are fused on the next line; the second has
    # lost its beginning.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")'Script: {script_activated}, device: {device}')

    # Get the experiment parameters
    params_file = os.path.join(args.model_dir, 'params.json')
    assert os.path.isfile(params_file), "No json configuration file found at {}".format(params_file)
    params = myutils.Params(params_file)

    # Reused across folds; the 'train'/'val' entries are overwritten each iteration.
    dfs = {}
    # Folds are numbered from args.fold_start through args.folds inclusive.
    for fold in range(args.fold_start, args.folds+1):
        # Load data from .csv files
        # Default (single split) file names ...
        tname = os.path.join(args.data_dir, 'train.csv')
        vname = os.path.join(args.data_dir, 'val.csv')

        # ... replaced by per-fold file names when more than one fold is requested.
        if args.folds > 1:
            # Load data from .csv files
            tname = os.path.join(args.data_dir, f'train{fold}.csv')
            vname = os.path.join(args.data_dir, f'val{fold}.csv')

        train = pd.read_csv(tname)
        val = pd.read_csv(vname)
        dfs['train'] = train
        dfs['val'] = val
        # Statistics come from the training frame only.
        # NOTE(review): fused with the tail of a lost call on the same line.
        mean, std = myutils.get_stats(train, params.size)'Model: {args.model_dir}\tFold: {fold}\tTrain: {tname}\tMean: {mean}\tStd: {std}')

        # Data
        loaders = myutils.get_module(args.net_dir, 'loaders')
        dataloaders, dataset_sizes = loaders.get_loaders(dfs, mean, std, size=params.size, batch_size=params.batch_size, num_workers=params.num_workers)
        # Net
        # NOTE(review): the remaining arguments and closing parenthesis of this
        # call were lost in the paste.
        net = myutils.get_network(args.net_dir,
        optimizer = myutils.get_optimizer(params.optimizer, net, params.learning_rate, params.momentum, params.weight_decay)
        # Schedulers
        # NOTE(review): the conditions selecting one of these four schedulers
        # were lost; as posted, only the indented assignments remain.
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, mode='max')
            scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=2.4, gamma=0.97)
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, mode='min')
            scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.94)
        # Loss function
        weight = myutils.get_weight(train)
        # NOTE(review): right-hand side lost -- presumably moves `weight` to
        # the device; confirm.
        weight =
        # NOTE(review): the middle argument of get_loss_fn was lost (note the
        # double comma), and the line is fused with the tail of a lost call.
        criterion = myutils.get_loss_fn(args.net_dir,, weight)'Model: {args.model_dir}\tFile: train{fold}.csv\tWeight: {weight}')

        # Train
        print(f'Fold {fold}')
        # NOTE(review): fused with the tail of a lost call.
        print('-'*10)'Model: {args.model_dir}\tFold: {fold}, training has started for {params.num_epochs} epochs')
        # NOTE(review): an argument was lost between `scheduler` and
        # `num_epochs` (double comma); train_eval's signature has `net_name`
        # in that position.
        train_eval(fold, dataloaders, dataset_sizes, net, criterion, optimizer,  scheduler,, num_epochs=params.num_epochs)'Model: {args.model_dir}\tFold: {fold}, training has ended')

Are you seeing some GPU activity?
If so, could you run the model training in isolation using random input data, and run the data loading in isolation without the training, to narrow down which part of the code uses all CPUs?

Yes. I am seeing around 30% GPU usage with gpustat. It varies from 0% to 30%.
I was using these settings to train.


I tried the tests in isolation. The leak happens in the data loading. This is my dataloader code:

import torch
import torchvision
import myutils

from import DataLoader, Dataset
from torchvision import transforms
from PIL import Image
from PIL import ImageFile

def get_loaders(dfs, mean, std, size, batch_size, num_workers):
    # NOTE(review): this listing is a damaged paste (truncated expressions,
    # missing `try:`, unterminated transform lists); it does not parse as
    # posted. `mean` and `std` are not used in any line that survived.
    #
    # Purpose, from the visible code: wrap every dataframe in `dfs` in a
    # `Derm` dataset and return (dataloaders, dataset_sizes), both dicts keyed
    # like `dfs`.
    Function that takes a dictionary of dataframes and
    returns 2 dictionaries of pytorch dataloaders and dataset_sizes
    # Reproducibility
    # NOTE(review): nothing survives under this heading in the paste.

    # Custom pytorch dataloader for this dataset
    class Derm(Dataset):
        Read a pandas dataframe with
        images paths and labels
        def __init__(self, df, transform=None):
            self.df = df
            self.transform = transform

        def __len__(self):
            # One sample per dataframe row.
            return len(self.df)

        def __getitem__(self, index):
                # NOTE(review): the `try:` pairing with the `except` below is
                # missing, and the start of the next expression was lost --
                # presumably it opens the image file named in the 'filenames'
                # column; confirm.
                # Load image data and get label
                X =['filenames'][index]).convert('RGB')
                # Label = every column from position 2 onward of the row.
                y = torch.tensor(self.df.iloc[index,2:])
            # NOTE(review): the handler body is missing from the paste.
            except IOError as err:

            if self.transform:
                X = self.transform(X)
            # Sanity check
            #print(info('id:', self.df['id'][index], 'label', y)
            # NOTE(review): three values are returned per sample, but the
            # training loop posted earlier in the thread unpacks each batch as
            # `inputs, labels` -- confirm which side is current.
            return index, X, y

    # Transforms
    # NOTE(review): each transform list is cut off after Resize(size); the
    # remaining transforms and closing brackets were lost.
    data_transforms = {'train' : transforms.Compose([transforms.Resize(size),
                       'val' : transforms.Compose([transforms.Resize(size),
                       'test' : transforms.Compose([transforms.Resize(size),
                       'unknown' : transforms.Compose([transforms.Resize(size),

    # Sets
    image_datasets = {x: Derm(dfs[x], transform=data_transforms[x]) for x in dfs.keys()}
    # Sizes
    dataset_sizes = {x: len(image_datasets[x]) for x in dfs.keys()}
    # Loaders
    # NOTE(review): arguments are positional. In torch.utils.data.DataLoader
    # the third positional parameter is `shuffle`, not `num_workers`, so
    # `num_workers` appears to be bound to `shuffle` here and the real
    # num_workers keeps its default -- confirm and prefer keyword arguments.
    dataloaders = {x: DataLoader(image_datasets[x], batch_size, num_workers, pin_memory=False) for x in dfs.keys()}

    return dataloaders, dataset_sizes

I also deleted:

best_net_wts = copy.deepcopy(net.state_dict())

because it is a variable that I was not using. This helped a bit, but the overhead is still high.
It looks like PIL uses all CPU cores.

I don’t know whether PIL is able to use all cores for the processing, but you could verify it by running the PIL transformations in isolation and checking the CPU usage.
Did you replace the “vanilla” PIL with any other drop-in library?

CC @fmassa: in case you’ve seen this behavior before.

Thanks for the isolation advice. It did come down to the PIL image opening using all the cores. I read that it is more of a Python issue than a PIL one.

I had installed Pillow with conda (pillow=6.2.1),
and I found a very similar open issue on GitHub for Pillow:

I found a solution:

Environment variable: MAX_CONCURRENCY=n . Pillow can use multiprocessing to build the extension. Setting MAX_CONCURRENCY sets the number of CPUs to use, or can disable parallel building by using a setting of 1. By default, it uses 4 CPUs, or if 4 are not available, as many as are present.

I also moved all resizing and preprocessing outside of torch.