I have a model that I apply to 3D data. To provide an example I created a random dataset that always return the same tensor of the wanted size.
When executing the code below, the .cuda()
command is extremely slow (4s) except the first call (0.008s).
If I comment loss.backward()
it becomes much faster (0.008s).
I also put below the reports of torch.utils.bottleneck with and without loss.backward().
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from time import time
class RandomDataset(Dataset):
def __init__(self, size=(1, 169, 208, 179), length=18):
self.item = torch.rand(size)
self.length = length
def __len__(self):
return self.length
def __getitem__(self, idx):
return self.item
class Test(nn.Module):
"""
Classifier for a multi-class classification task
"""
def __init__(self):
super(Test, self).__init__()
self.features = nn.Sequential(
nn.Conv3d(1, 8, 3),
nn.BatchNorm3d(8),
nn.ReLU(),
nn.MaxPool3d(2, 2),
nn.Conv3d(8, 16, 3),
nn.BatchNorm3d(16),
nn.ReLU(),
nn.MaxPool3d(2, 2),
nn.Conv3d(16, 32, 3),
nn.BatchNorm3d(32),
nn.ReLU(),
nn.MaxPool3d(2, 2),
nn.Conv3d(32, 64, 3),
nn.BatchNorm3d(64),
nn.ReLU(),
nn.MaxPool3d(2, 2)
)
self.classifier = nn.Sequential(
nn.Linear(64 * 792, 1000),
nn.ReLU(),
nn.Linear(1000, 100),
nn.ReLU(),
nn.Linear(100, 2)
)
def forward(self, x):
x = self.features(x)
x = x.view(x.size(0), -1)
x = self.classifier(x)
return x
if __name__ == "__main__":
batch_size = 3
dataset = RandomDataset()
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, drop_last=True)
model = Test()
model = model.cuda()
total_time=0
criterion = nn.CrossEntropyLoss()
for i, data in enumerate(dataloader):
t0 = time()
data_gpu = data.cuda()
t1 = time()
total_time += t1 - t0
print("Loading data on GPU", t1 - t0)
output = model(data_gpu)
labels = torch.Tensor([0] * batch_size).long().cuda()
loss = criterion(output, labels)
loss.backward()
print("Mean time on loading data on GPU:", total_time / (len(dataset) / batch_size))
With loss.backward()
--------------------------------------------------------------------------------
Environment Summary
--------------------------------------------------------------------------------
PyTorch 1.0.0 compiled w/ CUDA 8.0.61
Running with Python 3.6 and
`pip3 list` truncated output:
numpy (1.14.3)
--------------------------------------------------------------------------------
cProfile output
--------------------------------------------------------------------------------
8396 function calls (7632 primitive calls) in 23.731 seconds
Ordered by: internal time
List reduced from 268 to 15 due to restriction <15>
ncalls tottime percall cumtime percall filename:lineno(function)
46 22.976 0.499 22.976 0.499 {method 'cuda' of 'torch._C._TensorBase' objects}
18 0.317 0.018 0.317 0.018 {method 'uniform_' of 'torch._C._TensorBase' objects}
18 0.259 0.014 0.259 0.014 {built-in method addmm}
6 0.086 0.014 0.086 0.014 {built-in method stack}
1 0.042 0.042 0.042 0.042 {built-in method rand}
6 0.018 0.003 0.018 0.003 {method 'run_backward' of 'torch._C._EngineBase' objects}
24 0.005 0.000 0.005 0.000 {built-in method conv3d}
24 0.002 0.000 0.002 0.000 {built-in method batch_norm}
36 0.002 0.000 0.002 0.000 {built-in method threshold}
1 0.002 0.002 23.731 23.731 script.py:1(<module>)
404 0.002 0.000 0.004 0.000 /home/elina.thibeausutre/miniconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/module.py:537(__setattr__)
24 0.002 0.000 0.002 0.000 {built-in method torch._C._nn.max_pool3d_with_indices}
24 0.001 0.000 0.004 0.000 /home/elina.thibeausutre/miniconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/batchnorm.py:58(forward)
150/12 0.001 0.000 0.276 0.023 /home/elina.thibeausutre/miniconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/module.py:483(__call__)
1224/1003 0.001 0.000 0.004 0.000 {built-in method builtins.isinstance}
without loss.backward()
--------------------------------------------------------------------------------
Environment Summary
--------------------------------------------------------------------------------
PyTorch 1.0.0 compiled w/ CUDA 8.0.61
Running with Python 3.6 and
`pip3 list` truncated output:
numpy (1.14.3)
--------------------------------------------------------------------------------
cProfile output
--------------------------------------------------------------------------------
8336 function calls (7572 primitive calls) in 4.048 seconds
Ordered by: internal time
List reduced from 263 to 15 due to restriction <15>
ncalls tottime percall cumtime percall filename:lineno(function)
46 3.318 0.072 3.318 0.072 {method 'cuda' of 'torch._C._TensorBase' objects}
18 0.312 0.017 0.312 0.017 {method 'uniform_' of 'torch._C._TensorBase' objects}
18 0.258 0.014 0.258 0.014 {built-in method addmm}
6 0.082 0.014 0.082 0.014 {built-in method stack}
1 0.040 0.040 0.040 0.040 {built-in method rand}
24 0.006 0.000 0.006 0.000 {built-in method conv3d}
36 0.004 0.000 0.004 0.000 {built-in method threshold}
24 0.003 0.000 0.003 0.000 {built-in method batch_norm}
24 0.003 0.000 0.003 0.000 {built-in method torch._C._nn.max_pool3d_with_indices}
404 0.002 0.000 0.004 0.000 /home/elina.thibeausutre/miniconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/module.py:537(__setattr__)
1 0.002 0.002 4.048 4.048 script.py:1(<module>)
150/12 0.001 0.000 0.281 0.023 /home/elina.thibeausutre/miniconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/module.py:483(__call__)
24 0.001 0.000 0.005 0.000 /home/elina.thibeausutre/miniconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/batchnorm.py:58(forward)
100/1 0.001 0.000 0.003 0.003 /home/elina.thibeausutre/miniconda3/envs/pytorch/lib/python3.6/abc.py:196(__subclasscheck__)
24/1 0.001 0.000 1.952 1.952 /home/elina.thibeausutre/miniconda3/envs/pytorch/lib/python3.6/site-packages/torch/nn/modules/module.py:185(_apply)