I have two neural networks (one deeper than the other). I am trying to profile their running time in PyTorch to see how differently they behave on the GPU. I use the code snippet below to profile my program.
import torch
from torch import cuda
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import FakeData
def profile():
    """Time host-to-GPU transfers and forward passes separately.

    Each batch from the loader is timed in two back-to-back regions using
    CUDA events: first the ``.cuda()`` copies of ``images`` and ``labels``,
    then the forward pass ``net(images)``. Both regions start *after* the
    batch has already been yielded by ``data_loader``, so time spent
    producing batches (dataset generation, transforms, collation, worker
    hand-off) is not included in either total.

    Returns:
        tuple: ``(computation_cost, communication_cost)``, the accumulated
        ``Event.elapsed_time`` values (milliseconds) over all batches.
    """
    dataset = FakeData(
        size=10000,
        image_size=(3, 224, 224),
        transform=transforms.ToTensor(),
    )
    data_loader = DataLoader(dataset, num_workers=2, batch_size=60)
    # The checkpoint path is elided (``...``) in the original snippet.
    net = torch.load(...).cuda()
    net.eval()
    computation_cost, communication_cost = 0, 0
    with torch.no_grad():
        for images, labels in data_loader:
            # profiling communication time
            start = cuda.Event(enable_timing=True)
            start.record()
            images = images.cuda()
            labels = labels.cuda()
            end = cuda.Event(enable_timing=True)
            end.record()
            # Wait for the recorded events to complete before reading them.
            cuda.synchronize()
            communication_cost += start.elapsed_time(end)
            # profiling computation time
            start = cuda.Event(enable_timing=True)
            start.record()
            _ = net(images)
            end = cuda.Event(enable_timing=True)
            end.record()
            cuda.synchronize()
            computation_cost += start.elapsed_time(end)
    # Hand the totals back to the caller instead of discarding them.
    return computation_cost, communication_cost
I got the following results from this program.
Deep network, computation: 22679 ms, communication: 1342 ms
Shallow network, computation: 3180 ms, communication: 1374 ms
Based on these results, I expected the total runtime of the shallow-network program to also be close to 15% of the total runtime of the deep-network program.
I used the slightly modified program below to profile the total runtime.
import torch
from torch import cuda
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import FakeData
def profile():
    """Time the whole inference loop with a single pair of CUDA events.

    The start event is recorded before the first batch is requested and the
    end event after the last forward pass, so the measured interval covers
    everything that happens in the loop: waiting for ``data_loader`` to
    yield each batch, the ``.cuda()`` copies, and the forward passes.

    Returns:
        float: ``start.elapsed_time(end)`` for the whole loop, in
        milliseconds.
    """
    dataset = FakeData(
        size=10000,
        image_size=(3, 224, 224),
        transform=transforms.ToTensor(),
    )
    data_loader = DataLoader(dataset, num_workers=2, batch_size=60)
    # The checkpoint path is elided (``...``) in the original snippet.
    net = torch.load(...).cuda()
    net.eval()
    start = cuda.Event(enable_timing=True)
    start.record()
    with torch.no_grad():
        # The original line was missing the trailing colon (SyntaxError).
        for images, labels in data_loader:
            images = images.cuda()
            labels = labels.cuda()
            _ = net(images)
    end = cuda.Event(enable_timing=True)
    end.record()
    # Wait for the end event to complete before reading the elapsed time.
    cuda.synchronize()
    total = start.elapsed_time(end)
    # Hand the measurement back to the caller instead of discarding it.
    return total
But the result is not what I expected.
Deep network, total: 25551 ms
Shallow network, total: 20691 ms
Am I doing anything wrong in the second profiling program?