GPU utilisation is low but memory is usage high. This is my code:
from tqdm import tqdm
import torchvision.transforms as transforms
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.datasets import CIFAR100
import torch.nn as nn
from torch.functional import split
import torch
import ssl
ssl._create_default_https_context = ssl._create_unverified_context
class VGG(nn.Module):
def __init__(self, in_channels=3, num_classes=1000):
super(VGG, self).__init__()
self.in_channels = in_channels
self.conv_layers = self.create_conv_layers(VGG16)
self.fcs = nn.Sequential(
nn.Linear(512*7*7, 4096),
nn.ReLU(),
nn.Dropout(p=0.5),
nn.Linear(4096, 4096),
nn.ReLU(), nn.Dropout(p=0.6), nn.Linear(4096, num_classes)
)
def forward(self, x):
x = self.conv_layers(x)
x = x.reshape(x.shape[0], -1)
x = self.fcs(x)
return x
def create_conv_layers(self, architecture):
layers = []
in_channels = self.in_channels
for x in architecture:
if type(x) == int:
out_channels = x
layers += [nn.Conv2d(in_channels=in_channels, out_channels=out_channels,
kernel_size=(3, 3), stride=(1, 1), padding=(1, 1)),
nn.BatchNorm2d(x),
nn.ReLU()
]
in_channels = out_channels
else:
layers += [nn.MaxPool2d(kernel_size=(2, 2), stride=(2, 2))]
return nn.Sequential(*layers)
if __name__ == '__main__':
VGG16 = [64, 64, 'M', 128, 128, 'M', 256, 256, 256,
'M', 512, 512, 512, 'M', 512, 512, 512, 'M']
batch_size = 32
lr = 0.0001
device = "cuda"
num_epochs = 1
transform = transforms.Compose(
[transforms.ToTensor(), transforms.Resize((224, 224))])
train_dataset = CIFAR100(
root='dataset/', train=True, transform=transform, download=True)
train_loader = DataLoader(dataset=train_dataset,
batch_size=batch_size, shuffle=True, num_workers=6, pin_memory=True)
model = VGG().to(device)
optimizer = optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
for epoch in range(num_epochs):
losses = []
with tqdm(train_loader, unit="Batch", desc=f"Epoch {epoch + 1}") as tepoch:
num_batches = len(tepoch)
for batch_idx, (data, targets) in enumerate(tepoch):
data = data.to(device=device)
targets = targets.to(device=device)
preds = model(data)
loss = criterion(preds, targets)
losses.append(loss.item())
optimizer.zero_grad()
loss.backward()
optimizer.step()
I used the pytorch profiler. This is what is got:
--------------------------------------------------------------------------------
Environment Summary
--------------------------------------------------------------------------------
PyTorch 1.10.0 DEBUG compiled w/ CUDA 11.3
Running with Python 3.8 and CUDA 11.2.67
`pip3 list` truncated output:
mypy-extensions==0.4.3
numpy==1.20.1
numpydoc==1.1.0
torch==1.10.0
torch-tb-profiler==0.3.1
torchaudio==0.10.0
torchvision==0.11.1
--------------------------------------------------------------------------------
cProfile output
--------------------------------------------------------------------------------
6012080 function calls (5926529 primitive calls) in 631.801 seconds
Ordered by: internal time
List reduced from 1550 to 15 due to restriction <15>
ncalls tottime percall cumtime percall filename:lineno(function)
3223 379.244 0.118 379.244 0.118 {method 'to' of 'torch._C._TensorBase' objects}
1565 204.858 0.131 204.858 0.131 {method 'item' of 'torch._C._TensorBase' objects}
12 12.053 1.004 12.054 1.005 {method 'dump' of '_pickle.Pickler' objects}
20319 8.883 0.000 8.883 0.000 {built-in method conv2d}
1563 8.339 0.005 8.339 0.005 {method 'run_backward' of 'torch._C._EngineBase' objects}
4689 2.900 0.001 2.900 0.001 {built-in method torch._C._nn.linear}
1563 1.661 0.001 6.370 0.004 C:\Users\techi\anaconda3\lib\site-packages\torch\optim\_functional.py:54(adam)
20319 1.608 0.000 1.608 0.000 {built-in method batch_norm}
181308 1.338 0.000 1.338 0.000 {method 'mul_' of 'torch._C._TensorBase' objects}
181308 1.272 0.000 1.272 0.000 {method 'add_' of 'torch._C._TensorBase' objects}
90654 0.751 0.000 0.751 0.000 {method 'sqrt' of 'torch._C._TensorBase' objects}
90654 0.689 0.000 0.689 0.000 {method 'addcdiv_' of 'torch._C._TensorBase' objects}
23445 0.648 0.000 0.648 0.000 {built-in method relu}
90654 0.595 0.000 0.595 0.000 {method 'addcmul_' of 'torch._C._TensorBase' objects}
32 0.569 0.018 0.569 0.018 {method 'uniform_' of 'torch._C._TensorBase' objects}
--------------------------------------------------------------------------------
autograd profiler output (CUDA mode)
--------------------------------------------------------------------------------
top 15 events sorted by cpu_time_total
Because the autograd profiler uses the CUDA event API,
the CUDA time column reports approximately max(cuda_time, cpu_time).
Please ignore this output if your code does not use CUDA.
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg Self CUDA Self CUDA % CUDA total CUDA time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Optimizer.step#Adam.step 20.72% 498.367ms 20.91% 503.092ms 503.092ms 611.000us 9.00% 328.025ms 328.025ms 1
aten::uniform_ 18.01% 433.238ms 18.01% 433.238ms 433.238ms 1.000us 0.01% 1.000us 1.000us 1
enumerate(DataLoader)#_MultiProcessingDataLoaderIter... 14.69% 353.386ms 14.69% 353.390ms 353.390ms 2.000us 0.03% 3.000us 3.000us 1
Optimizer.zero_grad#Adam.zero_grad 13.09% 314.864ms 13.12% 315.547ms 315.547ms 77.000us 1.13% 317.186ms 317.186ms 1
aten::to 0.00% 10.000us 11.90% 286.182ms 286.182ms 2.000us 0.03% 2.141ms 2.141ms 1
aten::_to_copy 0.00% 19.000us 11.90% 286.172ms 286.172ms 4.000us 0.06% 2.139ms 2.139ms 1
aten::copy_ 11.90% 286.142ms 11.90% 286.142ms 286.142ms 2.134ms 31.43% 2.134ms 2.134ms 1
aten::to 0.00% 7.000us 10.90% 262.290ms 262.290ms 2.000us 0.03% 2.135ms 2.135ms 1
aten::_to_copy 0.00% 20.000us 10.90% 262.283ms 262.283ms 3.000us 0.04% 2.133ms 2.133ms 1
aten::copy_ 10.90% 262.254ms 10.90% 262.254ms 262.254ms 2.129ms 31.36% 2.129ms 2.129ms 1
aten::to 0.00% 8.000us 10.69% 257.116ms 257.116ms 2.000us 0.03% 1.820ms 1.820ms 1
aten::_to_copy 0.00% 20.000us 10.69% 257.108ms 257.108ms 4.000us 0.06% 1.818ms 1.818ms 1
aten::copy_ 10.69% 257.078ms 10.69% 257.078ms 257.078ms 1.813ms 26.70% 1.813ms 1.813ms 1
aten::to 0.00% 7.000us 10.64% 255.998ms 255.998ms 2.000us 0.03% 1.963ms 1.963ms 1
aten::_to_copy 0.00% 18.000us 10.64% 255.991ms 255.991ms 3.000us 0.04% 1.961ms 1.961ms 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 2.405s
Self CUDA time total: 6.789ms
Please help me find the issue. Thanks!