PyTorch time profiling weird behavior

I have two neural networks, one deeper than the other. I am trying to profile their execution time in PyTorch to see how differently they behave on the GPU. I use the code snippet below to profile my program.

import torch
from torch import cuda
from torch.utils.data import DataLoader

import torchvision.transforms as transforms
from torchvision.datasets import FakeData

def profile():
    """Time host-to-device copies and forward passes separately on the GPU.

    Runs the loaded network in eval mode over a synthetic ``FakeData`` set
    and, for every batch, brackets two regions with CUDA events:

    * communication: the ``.cuda()`` copies of ``images`` and ``labels``;
    * computation: the forward pass ``net(images)``.

    Fetching the next batch from ``data_loader`` happens outside both timed
    regions, so the two totals do not add up to the duration of the whole
    loop.

    Returns:
        tuple: ``(computation_cost, communication_cost)``, each the sum of
        the per-batch ``elapsed_time`` values (milliseconds).
    """
    dataset = FakeData(size=10000, image_size=(3,224,224), transform=transforms.ToTensor())
    data_loader = DataLoader(dataset, num_workers=2, batch_size=60)

    # NOTE(review): `...` is a placeholder for the serialized model path.
    net = torch.load(...).cuda()
    net.eval()

    computation_cost, communication_cost = 0, 0
    with torch.no_grad():
        for images, labels in data_loader:
            # profiling communication time
            start = cuda.Event(enable_timing=True)
            start.record()
            images = images.cuda()
            labels = labels.cuda()
            end = cuda.Event(enable_timing=True)
            end.record()
            # Wait for the recorded events to complete before reading them.
            cuda.synchronize()
            communication_cost += start.elapsed_time(end)

            # profiling computation time
            start = cuda.Event(enable_timing=True)
            start.record()
            _ = net(images)
            end = cuda.Event(enable_timing=True)
            end.record()
            cuda.synchronize()
            computation_cost += start.elapsed_time(end)

    # Hand the totals back to the caller instead of discarding them.
    return computation_cost, communication_cost

I got the following results from this program.

Deep network, computation: 22679 ms, communication: 1342 ms
Shallow network, computation: 3180 ms, communication: 1374 ms

Based on these results, I expected the total runtime of the shallow-network program to also be close to 15% of the total runtime of the deep-network program.

I used the slightly modified program below to profile the total runtime.

import torch
from torch import cuda
from torch.utils.data import DataLoader

import torchvision.transforms as transforms
from torchvision.datasets import FakeData

def profile():
    """Time the whole inference loop with a single pair of CUDA events.

    The start event is recorded before the first batch is requested and the
    end event after the last forward pass has been issued, so the measured
    interval spans everything in between: fetching batches from
    ``data_loader``, the ``.cuda()`` copies, and the forward passes.

    Returns:
        float: ``start.elapsed_time(end)`` for the whole loop (milliseconds).
    """
    dataset = FakeData(size=10000, image_size=(3,224,224), transform=transforms.ToTensor())
    data_loader = DataLoader(dataset, num_workers=2, batch_size=60)

    # NOTE(review): `...` is a placeholder for the serialized model path.
    net = torch.load(...).cuda()
    net.eval()

    start = cuda.Event(enable_timing=True)
    start.record()

    with torch.no_grad():
        for images, labels in data_loader:
            images = images.cuda()
            labels = labels.cuda()
            _ = net(images)

    end = cuda.Event(enable_timing=True)
    end.record()
    # Wait for the end event to complete before reading the elapsed time.
    cuda.synchronize()

    total = start.elapsed_time(end)
    return total

But the result is not what I expect.

Deep network, total: 25551 ms
Shallow network, total: 20691 ms

Am I doing anything wrong in the second profiling program?