GPU utilisation low but memory usage high

GPU utilisation is low but memory is usage high. This is my code:

from tqdm import tqdm
import torchvision.transforms as transforms
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
import torch.nn as nn
from torch.functional import split
import torch
import ssl
ssl._create_default_https_context = ssl._create_unverified_context


class VGG(nn.Module):
    def __init__(self, in_channels=3, num_classes=1000):
        super(VGG, self).__init__()
        self.in_channels = in_channels
        self.conv_layers = self.create_conv_layers(VGG16)

        self.fcs = nn.Sequential(
            nn.Linear(512*7*7, 4096),
            nn.ReLU(),
            nn.Dropout(p=0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(), nn.Dropout(p=0.6), nn.Linear(4096, num_classes)
        )

    def forward(self, x):
        x = self.conv_layers(x)
        x = x.reshape(x.shape[0], -1)
        x = self.fcs(x)
        return x

    def create_conv_layers(self, architecture):
        layers = []
        in_channels = self.in_channels

        for x in architecture:
            if type(x) == int:
                out_channels = x

                layers += [nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
                                     kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
                           nn.BatchNorm2d(x),
                           nn.ReLU()
                           ]

                in_channels = out_channels
            else:
                layers += [nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))]

        return nn.Sequential(*layers)


if __name__ == '__main__':
    VGG16 = [64, 64, 'M', 128, 128, 'M', 256, 256, 256,
             'M', 512, 512, 512, 'M', 512, 512, 512, 'M']

    batch_size = 32
    lr = 0.0001
    device = "cuda"
    num_epochs = 1
    transform = transforms.Compose(
        [transforms.ToTensor(), transforms.Resize((224, 224))])

    train_dataset = CIFAR100(
        root='dataset/', train=True, transform=transform, download=True)
    train_loader = DataLoader(dataset=train_dataset,
                              batch_size=batch_size, shuffle=True, num_workers=6, pin_memory=True)

    model = VGG().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(num_epochs):
        losses = []

        with tqdm(train_loader, unit="Batch", desc=f"Epoch {epoch + 1}") as tepoch:
            num_batches = len(tepoch)

            for batch_idx, (data, targets) in enumerate(tepoch):
                data = data.to(device=device)
                targets = targets.to(device=device)

                preds = model(data)
                loss = criterion(preds, targets)
                losses.append(loss.item())

                optimizer.zero_grad()
                loss.backward()

                optimizer.step()

I used the pytorch profiler. This is what is got:

--------------------------------------------------------------------------------
  Environment Summary
--------------------------------------------------------------------------------
PyTorch 1.10.0 DEBUG compiled w/ CUDA 11.3
Running with Python 3.8 and CUDA 11.2.67

`pip3 list` truncated output:
mypy-extensions==0.4.3
numpy==1.20.1
numpydoc==1.1.0
torch==1.10.0
torch-tb-profiler==0.3.1
torchaudio==0.10.0
torchvision==0.11.1
--------------------------------------------------------------------------------
  cProfile output
--------------------------------------------------------------------------------
         6012080 function calls (5926529 primitive calls) in 631.801 seconds

   Ordered by: internal time
   List reduced from 1550 to 15 due to restriction <15>

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
     3223  379.244    0.118  379.244    0.118 {method 'to' of 'torch._C._TensorBase' objects}
     1565  204.858    0.131  204.858    0.131 {method 'item' of 'torch._C._TensorBase' objects}
       12   12.053    1.004   12.054    1.005 {method 'dump' of '_pickle.Pickler' objects}
    20319    8.883    0.000    8.883    0.000 {built-in method conv2d}
     1563    8.339    0.005    8.339    0.005 {method 'run_backward' of 'torch._C._EngineBase' objects}
     4689    2.900    0.001    2.900    0.001 {built-in method torch._C._nn.linear}
     1563    1.661    0.001    6.370    0.004 C:\Users\techi\anaconda3\lib\site-packages\torch\optim\_functional.py:54(adam)
    20319    1.608    0.000    1.608    0.000 {built-in method batch_norm}
   181308    1.338    0.000    1.338    0.000 {method 'mul_' of 'torch._C._TensorBase' objects}
   181308    1.272    0.000    1.272    0.000 {method 'add_' of 'torch._C._TensorBase' objects}
    90654    0.751    0.000    0.751    0.000 {method 'sqrt' of 'torch._C._TensorBase' objects}
    90654    0.689    0.000    0.689    0.000 {method 'addcdiv_' of 'torch._C._TensorBase' objects}
    23445    0.648    0.000    0.648    0.000 {built-in method relu}
    90654    0.595    0.000    0.595    0.000 {method 'addcmul_' of 'torch._C._TensorBase' objects}
       32    0.569    0.018    0.569    0.018 {method 'uniform_' of 'torch._C._TensorBase' objects}


--------------------------------------------------------------------------------
  autograd profiler output (CUDA mode)
--------------------------------------------------------------------------------
        top 15 events sorted by cpu_time_total

        Because the autograd profiler uses the CUDA event API,
        the CUDA time column reports approximately max(cuda_time, cpu_time).
        Please ignore this output if your code does not use CUDA.

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls 

-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                               Optimizer.step#Adam.step        20.72%     498.367ms        20.91%     503.092ms     503.092ms     611.000us         9.00%     328.025ms     328.025ms             1  
                                         aten::uniform_        18.01%     433.238ms        18.01%     433.238ms     433.238ms       1.000us         0.01%       1.000us       1.000us             1  
enumerate(DataLoader)#_MultiProcessingDataLoaderIter...        14.69%     353.386ms        14.69%     353.390ms     353.390ms       2.000us         0.03%       3.000us       3.000us             1  
                     Optimizer.zero_grad#Adam.zero_grad        13.09%     314.864ms        13.12%     315.547ms     315.547ms      77.000us         1.13%     317.186ms     317.186ms             1  
                                               aten::to         0.00%      10.000us        11.90%     286.182ms     286.182ms       2.000us         0.03%       2.141ms       2.141ms             1  
                                         aten::_to_copy         0.00%      19.000us        11.90%     286.172ms     286.172ms       4.000us         0.06%       2.139ms       2.139ms             1  
                                            aten::copy_        11.90%     286.142ms        11.90%     286.142ms     286.142ms       2.134ms        31.43%       2.134ms       2.134ms             1  
                                               aten::to         0.00%       7.000us        10.90%     262.290ms     262.290ms       2.000us         0.03%       2.135ms       2.135ms             1  
                                         aten::_to_copy         0.00%      20.000us        10.90%     262.283ms     262.283ms       3.000us         0.04%       2.133ms       2.133ms             1  
                                            aten::copy_        10.90%     262.254ms        10.90%     262.254ms     262.254ms       2.129ms        31.36%       2.129ms       2.129ms             1  
                                               aten::to         0.00%       8.000us        10.69%     257.116ms     257.116ms       2.000us         0.03%       1.820ms       1.820ms             1  
                                         aten::_to_copy         0.00%      20.000us        10.69%     257.108ms     257.108ms       4.000us         0.06%       1.818ms       1.818ms             1  
                                            aten::copy_        10.69%     257.078ms        10.69%     257.078ms     257.078ms       1.813ms        26.70%       1.813ms       1.813ms             1  
                                               aten::to         0.00%       7.000us        10.64%     255.998ms     255.998ms       2.000us         0.03%       1.963ms       1.963ms             1  
                                         aten::_to_copy         0.00%      18.000us        10.64%     255.991ms     255.991ms       3.000us         0.04%       1.961ms       1.961ms             1  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
Self CPU time total: 2.405s
Self CUDA time total: 6.789ms

Please help me find the issue. Thanks!

I would recommend to check out the performance guide, which would point to towards using e.g. torch.backends.cudnn.benchmark = True and to e.g. avoid synchronizations via item() calls.

1 Like