Underutilization of GPU during training


I am trying to classify generated two-class checkerboard data, shown below, with a simple fully connected dense network with a few hidden layers. I notice that less than 25% of the GPU is being used during training. Could you please advise on what could be causing that? I have attached the code below. I also ran the code with torch.utils.bottleneck and have attached the profiler output in case it helps.

Thanks in advance!

Checkerboard Data


import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from math import sqrt, ceil
from collections import OrderedDict

import torch
import torch.cuda as cuda
import torch.nn as nn
import torch.optim as optim

# Select the compute device once; the model and every batch are moved to it.
# The CPU assignment must live in an `else` branch, otherwise it overwrites
# the CUDA device unconditionally and training silently runs on the CPU.
if cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print('device', device)

def plot_data(X, y, num_classes, save_as):
    """Draw a scatter plot of 2-D samples coloured by class label.

    Args:
        X: array indexed as ``X[:, 0]`` / ``X[:, 1]`` for the two plotted
            dimensions.
        y: per-sample labels, used as the seaborn ``hue``.
        num_classes: number of classes; selects that many colours from a
            fixed list of four and is shown in the title.
        save_as: intended output path. NOTE(review): this argument is not
            used by the function — nothing is written to disk. Confirm
            whether a ``savefig`` call was intended.
    """
    f, ax = plt.subplots(nrows=1, ncols=1, figsize=(15, 8))
    # Only four colours are defined, so the palette is truncated to the
    # number of classes actually present.
    colors = ['tab:blue', 'tab:orange', 'tab:green', 'tab:purple'][0:num_classes]
    sns.scatterplot(x=X[:, 0], y=X[:, 1], hue=y, palette=colors, ax=ax)
    # Title reflects the real class count instead of a hard-coded "3".
    ax.set_title("2D %d classes Generated Data" % num_classes)
    # Label through the axes object rather than pyplot's implicit
    # "current axes" so the labels always land on this figure.
    ax.set_ylabel('Dim 2')
    ax.set_xlabel('Dim 1')

def generate_data(dataset, size, batchsize=32):
    """Generate a two-class checkerboard dataset and wrap it in DataLoaders.

    Args:
        dataset: name used only to build the figure path handed to
            ``plot_data``.
        size: number of 2-D points to sample.
        batchsize: mini-batch size of the training loader. Defaults to 32
            (the previously hard-coded value); larger batches give the
            device more work per step.

    Returns:
        Tuple ``(X, y, trainset, trainloader, testset, testloader,
        num_classes)`` where ``X``/``y`` are the full NumPy arrays and the
        sets/loaders hold an 80/20 stratified train/test split.
    """
    num_classes = 2
    # np.random.random samples [0, 1), so the points lie in [-1, 1) x [-1, 1).
    X = 2 * np.random.random((size, 2)) - 1

    def classifier2(X):
        """Label points with a 4x4 checkerboard pattern (0.0 or 1.0).

        The same method can be used to make up other checkerboard patterns.
        """
        return (np.sum(np.ceil(2 * X).astype(int), axis=1) % 2).astype(float)

    y = classifier2(X)

    plot_data(X, y, num_classes, 'figures/all/'+dataset+'_'+str(num_classes)+'_.png')

    # Fixed random_state keeps the split reproducible; stratify preserves the
    # class balance in both parts.
    x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

    # Create trainloader
    trainset = torch.utils.data.TensorDataset(torch.tensor(x_train, dtype=torch.float32),
                                              torch.tensor(y_train, dtype=torch.long))
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batchsize,
                                              shuffle=True, num_workers=0, pin_memory=True)

    # The whole test set is served as a single batch, so shuffling it would
    # only cost time without changing any averaged metric.
    testset = torch.utils.data.TensorDataset(torch.tensor(x_test, dtype=torch.float32),
                                             torch.tensor(y_test, dtype=torch.long))
    testloader = torch.utils.data.DataLoader(testset, batch_size=len(testset),
                                             shuffle=False, num_workers=0, pin_memory=True)

    return X, y, trainset, trainloader, testset, testloader, num_classes

class single_model_deep(nn.Module):
    """Fully connected ReLU classifier for 2-D inputs.

    The network is ``linear_1 -> ReLU -> linear_2 -> ReLU -> linear_3 ->
    ReLU -> linear_4`` and returns raw logits (suitable for
    ``nn.CrossEntropyLoss``). It relies on the module-level ``device``
    defined elsewhere in this file.

    NOTE(review): ``train`` below shadows ``nn.Module.train(mode)``, which
    ``nn.Module.eval()`` calls internally, so ``model.eval()`` cannot be used
    on this class. The name is kept because existing callers invoke
    ``model.train(trainloader, ...)``.
    """

    def __init__(self, parameters, num_experts, num_classes, bias=None):
        """Build the layer stack and move it to ``device``.

        Args:
            parameters: value from which the hidden width is derived; the
                formula below is the positive root of
                ``1.5*h**2 + 3*h = parameters`` (presumably a target
                parameter count — confirm).
            num_experts: the hidden width is a multiple of
                ``num_experts + 1``.
            num_classes: number of output logits.
            bias: optional initial value for the output-layer bias (see the
                review note in the body).
        """
        super(single_model_deep, self).__init__()
        output = ceil((sqrt(6*parameters + 9)/3 - 1)/(num_experts + 1))
        if output <= 0:
            output = 1
        print('parameters', parameters, 'num_experts', num_experts+1, 'output', output)

        hidden = (num_experts + 1) * output
        # Keep at least one unit per group so the third layer never has
        # zero output features when output == 1.
        narrow = (num_experts + 1) * max(output // 2, 1)
        self.model = nn.Sequential(OrderedDict([
            ('linear_1', nn.Linear(2, hidden)),
            ('relu_1', nn.ReLU()),
            ('linear_2', nn.Linear(hidden, hidden)),
            ('relu_2', nn.ReLU()),
            ('linear_3', nn.Linear(hidden, narrow)),
            ('relu_3', nn.ReLU()),
            ('linear_4', nn.Linear(narrow, num_classes)),
        ]))

        if bias is not None:
            # NOTE(review): the statement that consumed `bias` was missing
            # from the source; it is assumed here to initialise the output
            # layer's bias. Confirm against the original intent.
            layers = dict(self.model.named_children())
            with torch.no_grad():
                layers['linear_4'].bias.copy_(torch.as_tensor(bias))

        self.model = self.model.to(device)

    def forward(self, input):
        """Return the class logits for a batch of 2-D points."""
        return self.model(input)

    @staticmethod
    def _average(total, count):
        """Return ``total / count`` as a float, or NaN for an empty loader."""
        if count == 0:
            return float('nan')
        return float(total) / count

    def train(self, trainloader, testloader, optimizer, loss_criterion, accuracy, epochs):
        """Optimise the model and report per-epoch metrics.

        Args:
            trainloader: iterable of ``(inputs, labels)`` training batches.
            testloader: iterable of ``(inputs, labels)`` validation batches.
            optimizer: optimizer constructed over this model's parameters.
            loss_criterion: callable ``(outputs, labels) -> loss tensor``.
            accuracy: callable ``(outputs, labels) -> batch accuracy``.
            epochs: number of passes over ``trainloader``.

        Returns:
            dict with lists ``'loss'``, ``'accuracy'`` and ``'val_accuracy'``,
            one float per epoch (averaged over batches).
        """
        history = {'loss': [], 'accuracy': [], 'val_accuracy': []}
        for epoch in range(0, epochs):
            running_loss = 0.0
            training_accuracy = 0.0
            test_accuracy = 0.0
            i = 0
            for inputs, labels in trainloader:
                # get the inputs; data is a list of [inputs, labels]
                inputs, labels = inputs.to(device, non_blocking=True), labels.to(device, non_blocking=True)
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward + backward + optimize
                outputs = self(inputs)
                loss = loss_criterion(outputs, labels)
                loss.backward()
                optimizer.step()
                # Accumulate on the device; calling .item() per batch would
                # force a host/device synchronisation every step.
                running_loss += loss.detach()
                training_accuracy += accuracy(outputs.detach(), labels)
                i += 1

            j = 0
            # Validation needs no gradients.
            with torch.no_grad():
                for test_input, test_labels in testloader:
                    test_input, test_labels = test_input.to(device), test_labels.to(device)
                    test_outputs = self(test_input)
                    test_accuracy += accuracy(test_outputs, test_labels)
                    j += 1

            # Single conversion to Python floats per epoch.
            epoch_loss = self._average(running_loss, i)
            epoch_accuracy = self._average(training_accuracy, i)
            epoch_val_accuracy = self._average(test_accuracy, j)
            history['loss'].append(epoch_loss)
            history['accuracy'].append(epoch_accuracy)
            history['val_accuracy'].append(epoch_val_accuracy)
            print('epoch: %d loss: %.3f training accuracy: %.3f val accuracy: %.3f' %
                  (epoch + 1, epoch_loss, epoch_accuracy, epoch_val_accuracy))
        return history

def accuracy(out, yb):
    """Return the fraction of rows whose arg-max along dim 1 equals the label.

    Args:
        out: score tensor; the predicted class is the index of the largest
            entry along dimension 1.
        yb: tensor of target class indices compared element-wise with the
            predictions.

    Returns:
        Zero-dimensional float tensor holding the mean of the matches.
    """
    predicted = out.argmax(dim=1)
    hits = predicted.eq(yb)
    return hits.float().mean()

# Driver: build the checkerboard dataset, construct the model and train it.

# 3000 generated points; generate_data also plots them and returns the
# train/test datasets and loaders.
X, y, trainset, trainloader, testset, testloader, num_classes = generate_data('checker_board-2', 3000)

num_experts = 2
# Overrides the num_classes value returned by generate_data above.
num_classes = 2

# 2062 is passed as the `parameters` argument, from which the constructor
# derives the hidden-layer width.
model = single_model_deep(2062, num_experts, num_classes)

optimizer = optim.RMSprop(model.parameters(),
                          lr=0.001, momentum=0.9)

epochs = 40
# Note: this is the custom single_model_deep.train loop, not
# nn.Module.train(mode).
hist = model.train(trainloader, testloader, optimizer, nn.CrossEntropyLoss(), accuracy, epochs=epochs)

torch.utils.bottleneck output

`bottleneck` is a tool that can be used as an initial step for debugging
bottlenecks in your program.

It summarizes runs of your script with the Python profiler and PyTorch's
autograd profiler. Because your script will be profiled, please ensure that it
exits in a finite amount of time.

For more complicated uses of the profilers, please see
https://docs.python.org/3/library/profile.html and
https://pytorch.org/docs/master/autograd.html#profiler for more information.
Running environment analysis...
Running your script with cProfile
device cuda:0
  Environment Summary
PyTorch 1.6.0 compiled w/ CUDA 10.2
Running with Python 3.8 and 

`pip3 list` truncated output:
  cProfile output
         3731069 function calls (3669088 primitive calls) in 8.131 seconds

   Ordered by: internal time
   List reduced from 6411 to 15 due to restriction <15>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     6088    2.146    0.000    2.146    0.000 {method 'to' of 'torch._C._TensorBase' objects}
     3000    1.026    0.000    1.026    0.000 {method 'run_backward' of 'torch._C._EngineBase' objects}
    12160    0.570    0.000    0.570    0.000 {built-in method addmm}
    48000    0.330    0.000    0.330    0.000 {method 'mul_' of 'torch._C._TensorBase' objects}
    48000    0.290    0.000    0.290    0.000 {method 'add_' of 'torch._C._TensorBase' objects}
   360000    0.239    0.000    0.239    0.000 /home/local/peac004/anaconda3/envs/pytorch1.6.0/lib/python3.8/site-packages/torch/utils/data/dataset.py:162(<genexpr>)
     6080    0.166    0.000    0.166    0.000 {built-in method stack}
    24000    0.163    0.000    0.163    0.000 {method 'sqrt' of 'torch._C._TensorBase' objects}
     3000    0.144    0.000    1.229    0.000 /home/local/peac004/anaconda3/envs/pytorch1.6.0/lib/python3.8/site-packages/torch/optim/rmsprop.py:55(step)
      790    0.140    0.000    0.140    0.000 {method 'read' of '_io.BufferedReader' objects}
    24000    0.135    0.000    0.135    0.000 {method 'addcmul_' of 'torch._C._TensorBase' objects}
    24000    0.115    0.000    0.115    0.000 {method 'addcdiv_' of 'torch._C._TensorBase' objects}
33400/6040    0.101    0.000    1.164    0.000 /home/local/peac004/anaconda3/envs/pytorch1.6.0/lib/python3.8/site-packages/torch/nn/modules/module.py:710(_call_impl)
     9120    0.096    0.000    0.096    0.000 {built-in method relu}
    23992    0.093    0.000    0.093    0.000 {method 'zero_' of 'torch._C._TensorBase' objects}

  autograd profiler output (CPU mode)
        top 15 events sorted by cpu_time_total

-----------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------------------------------------  
Name               Self CPU total %  Self CPU total   CPU total %      CPU total        CPU time avg     CUDA total %     CUDA total       CUDA time avg    Number of Calls  Input Shapes                                   
-----------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------------------------------------  
stack              15.90%           194.232ms        15.90%           194.232ms        194.232ms        NaN              0.000us          0.000us          1                []                                             
cat                15.89%           194.190ms        15.89%           194.190ms        194.190ms        NaN              0.000us          0.000us          1                []                                             
_cat               15.89%           194.187ms        15.89%           194.187ms        194.187ms        NaN              0.000us          0.000us          1                []                                             
stack              7.98%            97.506ms         7.98%            97.506ms         97.506ms         NaN              0.000us          0.000us          1                []                                             
cat                7.98%            97.463ms         7.98%            97.463ms         97.463ms         NaN              0.000us          0.000us          1                []                                             
_cat               7.98%            97.459ms         7.98%            97.459ms         97.459ms         NaN              0.000us          0.000us          1                []                                             
TBackward          4.04%            49.404ms         4.04%            49.404ms         49.404ms         NaN              0.000us          0.000us          1                []                                             
t                  4.04%            49.401ms         4.04%            49.401ms         49.401ms         NaN              0.000us          0.000us          1                []                                             
stack              4.03%            49.291ms         4.03%            49.291ms         49.291ms         NaN              0.000us          0.000us          1                []                                             
cat                4.03%            49.250ms         4.03%            49.250ms         49.250ms         NaN              0.000us          0.000us          1                []                                             
_cat               4.03%            49.247ms         4.03%            49.247ms         49.247ms         NaN              0.000us          0.000us          1                []                                             
stack              2.16%            26.343ms         2.16%            26.343ms         26.343ms         NaN              0.000us          0.000us          1                []                                             
unsqueeze          2.02%            24.658ms         2.02%            24.658ms         24.658ms         NaN              0.000us          0.000us          1                []                                             
as_strided         2.02%            24.644ms         2.02%            24.644ms         24.644ms         NaN              0.000us          0.000us          1                []                                             
AddmmBackward      2.02%            24.634ms         2.02%            24.634ms         24.634ms         NaN              0.000us          0.000us          1                []                                             
-----------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------------------------------------  
Self CPU time total: 1.222s
CUDA time total: 0.000us

  autograd profiler output (CUDA mode)
        top 15 events sorted by cpu_time_total

	Because the autograd profiler uses the CUDA event API,
	the CUDA time column reports approximately max(cuda_time, cpu_time).
	Please ignore this output if your code does not use CUDA.

-----------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------------------------------------  
Name               Self CPU total %  Self CPU total   CPU total %      CPU total        CPU time avg     CUDA total %     CUDA total       CUDA time avg    Number of Calls  Input Shapes                                   
-----------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------------------------------------  
stack              16.39%           200.653ms        16.39%           200.653ms        200.653ms        16.44%           200.594ms        200.594ms        1                []                                             
cat                16.37%           200.442ms        16.37%           200.442ms        200.442ms        16.41%           200.278ms        200.278ms        1                []                                             
_cat               16.37%           200.435ms        16.37%           200.435ms        200.435ms        16.41%           200.273ms        200.273ms        1                []                                             
stack              7.81%            95.617ms         7.81%            95.617ms         95.617ms         7.82%            95.463ms         95.463ms         1                []                                             
cat                7.79%            95.413ms         7.79%            95.413ms         95.413ms         7.80%            95.138ms         95.138ms         1                []                                             
_cat               7.79%            95.406ms         7.79%            95.406ms         95.406ms         7.80%            95.133ms         95.133ms         1                []                                             
TBackward          4.09%            50.112ms         4.09%            50.112ms         50.112ms         4.11%            50.112ms         50.112ms         1                []                                             
t                  4.09%            50.105ms         4.09%            50.105ms         50.105ms         4.11%            50.106ms         50.106ms         1                []                                             
stack              3.30%            40.384ms         3.30%            40.384ms         40.384ms         3.30%            40.224ms         40.224ms         1                []                                             
cat                3.28%            40.199ms         3.28%            40.199ms         40.199ms         3.27%            39.907ms         39.907ms         1                []                                             
_cat               3.28%            40.194ms         3.28%            40.194ms         40.194ms         3.27%            39.902ms         39.902ms         1                []                                             
stack              3.08%            37.701ms         3.08%            37.701ms         37.701ms         3.09%            37.747ms         37.747ms         1                []                                             
stack              2.18%            26.649ms         2.18%            26.649ms         26.649ms         2.00%            24.450ms         24.450ms         1                []                                             
AddmmBackward      2.08%            25.521ms         2.08%            25.521ms         25.521ms         2.09%            25.521ms         25.521ms         1                []                                             
mm                 2.08%            25.423ms         2.08%            25.423ms         25.423ms         2.08%            25.410ms         25.410ms         1                []                                             
-----------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------  ---------------------------------------------  
Self CPU time total: 1.224s
CUDA time total: 1.220s

Hey. The model has too few weights and the batches (of only 2-dimensional data samples) are too small to saturate the GPU's memory and compute. Try increasing the batch size and see whether GPU utilization goes up.