GPU utilization 0% but dedicated memory is full

Hello,

I am training a ViT network using PyTorch. The training process is very slow. When I checked the Task Manager, I found that although the dedicated GPU memory is fully used, GPU utilization is 0%. To be precise, the GPU utilization first increases and then, after a few seconds, drops to 0.

I tried this method, but the situation is still the same as before.
I also tried this method and got the following output, which looks strange:
data loading time: 59.500293016433716
data loading time: 60.46416115760803
data loading time: 61.4690420627594
data loading time: 62.47226524353027
data loading time: 63.4786810874939
data loading time: 64.48584127426147
data loading time: 65.48938298225403

The problem is that I am only using random tensors, so why is the data loading time so high?
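For reference, this is roughly how per-batch data loading time is usually measured: the timer is reset at the end of every iteration, so each printed value covers only the wait for one batch. A minimal, self-contained sketch (the toy dataset, shapes and batch size are illustrative only):

import time
import torch
from torch.utils.data import DataLoader, TensorDataset

if __name__ == "__main__":
    # toy stand-in for a real Dataset; shapes and sizes are illustrative only
    dataset = TensorDataset(torch.rand(256, 3, 32, 32), torch.rand(256, 10))
    loader = DataLoader(dataset, batch_size=8, num_workers=2)

    end = time.time()
    for inputs, labels in loader:
        print(f'data loading time: {time.time() - end:.3f}')  # waiting time for this batch only
        # ... forward/backward would go here ...
        end = time.time()  # reset so the next measurement covers a single batch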

This is my code for data preparation:

class MyDataset(Dataset):
    def __init__(self, img_folder, mask_folder, ratings, config, transform=None):
        self.data_list = os.listdir(img_folder)
        self.img_folder = img_folder
        self.mask_folder = mask_folder
        self.transform = transform
        self.config = config
        self.ratings = ratings
        self.data = {}

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        img_name = self.data_list[idx]
        # image = torch.load(os.path.join(self.img_folder, img_name))
        # print(img_name)
        #mask_name = img_name.split('.')[0] + '_mask.' + img_name.split('.')[1]
        #img_path = os.path.join(self.img_folder, img_name)
        #if self.mask_folder is not None:
        #    mask_path = os.path.join(self.mask_folder, mask_name)
        #else:
        #    mask_path = None
        #image = prepare_image_mask(img_path, mask_path, self.config)
        image = torch.rand(705,3075)
        labels = torch.Tensor(self.ratings[img_name.split('.')[0]])

        return image, labels


def prepare_dataloaders(config, ratings):
    my_dataset = MyDataset(config.path.img_folder, config.path.mask_folder, ratings, config.data)

    config_test = copy.deepcopy(config)
    config_test.data.max_seq_len_from_original_res = -1
    config_test.training.batch_size = 1  # inference is run at the original resolution
    my_testset = MyDataset(config.path.test_folder, config.path.test_mask_folder, ratings, config_test.data)

    # Split the dataset into train and validation sets
    train_dataset, val_dataset = torch.utils.data.random_split(
        my_dataset,
        [int(len(my_dataset) * config.training.train_size),
         len(my_dataset) - int(len(my_dataset) * config.training.train_size)])

    # Create DataLoader for train and test sets
    train_loader = DataLoader(train_dataset, batch_size=config.training.batch_size,
                              shuffle=config.training.shuffle_data,
                              num_workers=config.training.num_workers)
    val_loader = DataLoader(val_dataset, batch_size=config.training.batch_size, shuffle=config.training.shuffle_data,
                            num_workers=config.training.num_workers)
    test_loader = DataLoader(my_testset, batch_size=config_test.training.batch_size,
                             shuffle=config_test.training.shuffle_data,
                             num_workers=config_test.training.num_workers)

    dataloaders = {
        'train': train_loader,
        'val': val_loader,
        'test': test_loader
    }
    return dataloaders
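For reference, the DataLoader arguments that usually dominate loading throughput are num_workers, pin_memory and persistent_workers (on Windows, workers are re-spawned every epoch unless persistent_workers is set). A sketch of the train loader with these options spelled out; pin_memory and persistent_workers are assumptions here, not part of the config above:

# sketch only: same train_loader as above, with the worker/memory options made explicit
train_loader = DataLoader(
    train_dataset,
    batch_size=config.training.batch_size,
    shuffle=config.training.shuffle_data,
    num_workers=config.training.num_workers,  # > 0 loads batches in background processes
    pin_memory=True,                          # assumption: faster host-to-GPU copies
    persistent_workers=True,                  # assumption: keeps workers alive between epochs (needs num_workers > 0)
)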

If removing the actual model training and purely executing the data loading loop didn’t change anything regarding the runtime, you might indeed be suffering from a data loading bottleneck. I’m not familiar with your use case and how ratings is defined, but creating random tensors shouldn’t take a minute.
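Something like the following loop (a minimal sketch, reusing the dataloaders dict from your code) would exercise the data loading alone and show how long each batch takes to arrive:

import time

loader = dataloaders['train']
end = time.time()
for index, (inputs, labels) in enumerate(loader):
    print(f'batch {index}: {time.time() - end:.3f}s to load')  # no model work at all
    end = time.time()  # reset per batch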

Thank you very much for your quick response. Here is my training code, which looks normal to me.


torch.backends.cudnn.benchmark = True  #https://discuss.pytorch.org/t/gpu-utilisation-low-but-memory-usage-high/140025/2


def train_on_gpu(rank, model, device_ids, dataloaders, config, checkpoint_folder, logger, log_path):

    rank = rank - 1
    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

    writer = SummaryWriter(log_dir=log_path)

    if config.training.multi_gpu:
        model = nn.DataParallel(model, device_ids=[rank])
        model = model.to(device)
    else:
        model = model.to(device)

    # logger.info("---------- Prepare optimizers and learning rate scheduler----------")
    if config.training.optimizer == 'sgd':
        optimizer = optim.SGD(model.parameters(), lr=float(config.training.learning_rate),
                              momentum=float(config.training.momentum))
    elif config.training.optimizer == 'Adam':
        optimizer = optim.Adam(model.parameters(), lr=float(config.training.learning_rate))
    else:
        raise ValueError('Only Adam and sgd optimizers are supported.')

    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=150, eta_min=0.00001)

    # logger.info("---------- Prepare evaluation criteria and loss function ----------")
    emd = EMDLoss(r=2)
    metrics = Metrics()

    best_emd_loss = float('inf')

    scaler = GradScaler()

    if config.training.warm_start:
        checkpoint = torch.load(config.path.warm_start_checkpoint)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        epoch = checkpoint['epoch']
    else:
        epoch = 0

    # Training loop
    for epoch in tqdm(range(epoch, config.training.num_epochs)):
        # logger.info(f"Epoch {epoch} ")
        model.train()

        if config.model.frozen_vit:
            for param in model.encoder.parameters():
                param.requires_grad = False

        score_values = torch.arange(1, config.model.num_classes + 1, dtype=torch.float32, device=device)
        total_loss = 0.0

        end = time.time()
        for index, sample in enumerate(dataloaders['train']):
            # measure data loading time
            data_loading_time = time.time() - end
            print(f'data loading time: {data_loading_time}')
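            # note: `end` is only assigned once before the loop and is never reset inside it,
            # so the value printed above is the time elapsed since the loop started,
            # not the loading time of the current batch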

            inputs, labels = sample
            inputs, labels = inputs.to(device), labels.to(device)
            normalized_labels = labels / labels.sum(dim=1, keepdim=True)

            # logger.info(" -- Make prediction --")
            outputs = model(inputs)

            # logger.info(" -- Calculate loss --")
            loss = emd(outputs, normalized_labels)
            loss = loss / config.training.accumulation_steps

            # logger.info(" -- Backpropagation --")
            scaler.scale(loss).backward()

            if (index + 1) % config.training.accumulation_steps == 0:
                # may unscale_ here if desired (e.g., to allow clipping unscaled gradients)
                if config.training.gradient_clipping:
                    scaler.unscale_(optimizer)
                    # Apply gradient clipping
                    nn.utils.clip_grad_norm_(model.parameters(), config.training.max_grad_norm)

                scaler.step(optimizer)
                scaler.update()
                optimizer.zero_grad(set_to_none=True)  # https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html

            total_loss += loss.item()
            # logger.info(f" -- loss: {loss.item()} --")
            writer.add_scalar("EMD Loss/train", loss.item(), index + epoch * (len(dataloaders['train'])))

        scheduler.step()

        # Print the average loss for this epoch
        avg_loss = total_loss / len(dataloaders['train'])
        writer.add_scalar("EMD Loss/average train", avg_loss, epoch)

        # Validation after each epoch
        # logger.info("---------- Start validation ----------")
        total_val_loss = 0.0
        preds_list = []
        labels_list = []
        with torch.no_grad():
            for index, (inputs, labels) in enumerate(dataloaders['val']):
                inputs, labels = inputs.to(device), labels.to(device)
                normalized_labels = labels / labels.sum(dim=1, keepdim=True)
                # logger.info(" -- Make prediction --")
                preds = model(inputs)

                val_loss = emd(preds, normalized_labels)
                # logger.info(f" -- val loss: {val_loss.item()} --")
                writer.add_scalar("EMD Loss/validation", val_loss.item(), index + epoch * (len(dataloaders['val'])))

                total_val_loss += val_loss

                preds = torch.sum(preds * score_values, dim=-1)
                normalized_labels = torch.sum(normalized_labels * score_values, dim=-1)
                preds_list.extend(preds.data.cpu().numpy())
                labels_list.extend(normalized_labels.data.cpu().numpy())

        avg_val_loss = total_val_loss / len(dataloaders['val'])
        writer.add_scalar("EMD Loss/average validation", avg_val_loss, epoch)

        # logger.info(" -- Calculate val metrics --")
        plcc, srcc, accuracy, mse, mae = metrics.calculate(preds_list, labels_list)
        writer.add_scalar("Metrics/accuracy val", accuracy, epoch)
        writer.add_scalar("Metrics/mse val", mse, epoch)
        writer.add_scalar("Metrics/mae val", mae, epoch)
        writer.add_scalar("Metrics/pearson val", plcc, epoch)
        writer.add_scalar("Metrics/spearman val", srcc, epoch)

        # logger.info(" -- Update learning rate with scheduler --")

        current_lr = optimizer.param_groups[0]['lr']
        # logger.info(f"new learning rate: {current_lr}")
        writer.add_scalar("Learning Rate", current_lr, epoch)

    writer.close()


def train(config, num_gpus, logger, log_path):

    device_ids = list(range(num_gpus))
    logger.info("Training started...")
    checkpoint_folder = "checkpoints"
    os.makedirs(checkpoint_folder, exist_ok=True)

    # Data preparation
    ratings = {}
    with open(config.path.ratings_path, newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=' ')
        for row in reader:
            id = row[1]
            rates = [int(x) for x in row[2:-3]]
            ratings[str(id)] = rates

    logger.info("---------- Create dataloaders ----------")
    dataloaders = prepare_dataloaders(config, ratings)

    logger.info("---------- Prepare model ----------")
    model = Model(config.model)

    total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    logger.info(f"---------- Total number of trainable parameters: {total_params} ----------")

    train_on_gpu(num_gpus, model, device_ids, dataloaders, config, checkpoint_folder,
                 logger, log_path)


if __name__ == "__main__":
    with open('config.yaml', 'r') as f:
        config = yaml.load(f, Loader=yaml.FullLoader)
        config = ml_collections.ConfigDict(config)
    num_gpus = torch.cuda.device_count()
    logger = get_logger(__name__)
    log_path = prepare_path_writer()
    train(config, num_gpus, logger, log_path)
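
To see how much of each iteration is actually GPU compute rather than data loading, one option is to synchronize around the forward/backward pass inside the training loop; a rough sketch (model, emd, inputs and normalized_labels are the names from the loop above, everything else is illustrative):

import time
import torch

compute_start = time.time()
outputs = model(inputs)
loss = emd(outputs, normalized_labels)
loss.backward()
torch.cuda.synchronize()  # wait for all queued GPU kernels before reading the clock
print(f'GPU compute time for this batch: {time.time() - compute_start:.3f}s')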

Update:
After making the following change in Windows, GPU utilization increased from 0% to around 5%:
Go to Settings → Display → Graphics → Change default graphics settings → turn off hardware-accelerated GPU scheduling.