I am analyzing pytorch code on both CPU and GPU. I have written a simple Module that just passes the input tensor through a nn.Linear layer and returns the output. The code is as follows:
import torch
import torch.nn as nn
with torch.profiler.profile(
activities=[
torch.profiler.ProfilerActivity.CPU
]
) as profiler:
class LinearModel(nn.Module):
def __init__(self):
super().__init__()
self.fc1 = nn.Linear(100, 10)
def forward(self, inp):
torch.cuda.nvtx.range_push("LinearModel fc1 forward")
out = self.fc1(inp)
torch.cuda.nvtx.range_pop()
return out
model = LinearModel()
model.to(torch.device("cuda"))
A = torch.randn(100, device="cuda")
B = model(A)
torch.cuda.nvtx.range_push("Hypothesis")
print(B.shape)
torch.cuda.nvtx.range_pop()
print(profiler.events())
I run the script with perf as follows:
perf stat python3 main_with_torchprofile.py
The output generated by pytorch profiler is as follows
torch.Size([10])
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Name Self CPU % Self CPU CPU total % CPU total CPU time avg # of Calls
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
aten::empty 0.00% 8.000us 0.00% 8.000us 8.000us 1
aten::detach 0.00% 2.000us 0.00% 25.000us 25.000us 1
detach 0.00% 23.000us 0.00% 23.000us 23.000us 1
aten::empty 0.00% 2.000us 0.00% 2.000us 2.000us 1
aten::detach 0.00% 2.000us 0.00% 3.000us 3.000us 1
detach 0.00% 1.000us 0.00% 1.000us 1.000us 1
aten::uniform_ 0.04% 1.599ms 0.04% 1.599ms 1.599ms 1
aten::uniform_ 0.00% 3.000us 0.00% 3.000us 3.000us 1
cudaGetDeviceCount 0.02% 610.000us 0.02% 610.000us 610.000us 1
cudaGetDeviceCount 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaGetDeviceProperties 0.00% 88.000us 0.00% 88.000us 88.000us 1
cudaGetDeviceProperties 0.00% 93.000us 0.00% 93.000us 93.000us 1
aten::to 0.01% 557.000us 99.55% 3.976s 3.976s 1
aten::_to_copy 0.00% 20.000us 99.54% 3.976s 3.976s 1
aten::empty_strided 0.02% 845.000us 2.12% 84.711ms 84.711ms 1
cudaMalloc 2.10% 83.866ms 2.10% 83.866ms 83.866ms 1
aten::copy_ 0.06% 2.536ms 97.42% 3.891s 3.891s 1
cudaMemcpyAsync 97.35% 3.888s 97.35% 3.888s 3.888s 1
cudaStreamSynchronize 0.00% 4.000us 0.00% 4.000us 4.000us 1
aten::_has_compatible_shallow_copy_type 0.00% 0.000us 0.00% 0.000us 0.000us 1
aten::_has_compatible_shallow_copy_type 0.00% 0.000us 0.00% 0.000us 0.000us 1
aten::to 0.00% 7.000us 0.00% 49.000us 49.000us 1
aten::_to_copy 0.00% 6.000us 0.00% 42.000us 42.000us 1
aten::empty_strided 0.00% 12.000us 0.00% 12.000us 12.000us 1
aten::copy_ 0.00% 14.000us 0.00% 24.000us 24.000us 1
cudaMemcpyAsync 0.00% 7.000us 0.00% 7.000us 7.000us 1
cudaStreamSynchronize 0.00% 3.000us 0.00% 3.000us 3.000us 1
aten::_has_compatible_shallow_copy_type 0.00% 0.000us 0.00% 0.000us 0.000us 1
aten::_has_compatible_shallow_copy_type 0.00% 0.000us 0.00% 0.000us 0.000us 1
aten::randn 0.00% 44.000us 0.04% 1.526ms 1.526ms 1
aten::empty 0.00% 8.000us 0.00% 8.000us 8.000us 1
aten::normal_ 0.04% 1.459ms 0.04% 1.474ms 1.474ms 1
cudaLaunchKernel 0.00% 15.000us 0.00% 15.000us 15.000us 1
aten::linear 0.05% 2.134ms 0.35% 13.881ms 13.881ms 1
aten::t 0.00% 19.000us 0.02% 853.000us 853.000us 1
aten::transpose 0.00% 4.000us 0.02% 834.000us 834.000us 1
aten::as_strided 0.02% 830.000us 0.02% 830.000us 830.000us 1
aten::matmul 0.01% 491.000us 0.18% 7.002ms 7.002ms 1
aten::unsqueeze 0.01% 445.000us 0.01% 446.000us 446.000us 1
aten::as_strided 0.00% 1.000us 0.00% 1.000us 1.000us 1
aten::mm 0.14% 5.771ms 0.15% 6.056ms 6.056ms 1
cudaFree 0.00% 1.000us 0.00% 1.000us 1.000us 1
cudaFree 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaDeviceGetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaDeviceGetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaDeviceGetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaDeviceGetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaDeviceGetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaDeviceGetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaDeviceGetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaDeviceGetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaDeviceGetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaDeviceGetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaDeviceGetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaMalloc 0.00% 139.000us 0.00% 139.000us 139.000us 1
cudaMemcpy 0.00% 13.000us 0.00% 13.000us 13.000us 1
cudaFuncSetAttribute 0.00% 6.000us 0.00% 6.000us 6.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 1.000us 0.00% 1.000us 1.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
cudaFuncSetAttribute 0.00% 0.000us 0.00% 0.000us 0.000us 1
------------------------------------------------------- ------------ ------------ ------------ ------------ ------------ ------------
Self CPU time total: 3.994s
The self CPU total time is 3.994s. But the wall clock time shown by perf reads
Performance counter stats for 'python3 main_with_torchprofile.py':
4,619.85 msec task-clock # 0.758 CPUs utilized
5,352 context-switches # 1.158 K/sec
27 cpu-migrations # 5.844 /sec
854,841 page-faults # 185.037 K/sec
15,735,891,555 cycles # 3.406 GHz
22,051,243,307 instructions # 1.40 insn per cycle
4,097,984,812 branches # 887.039 M/sec
36,976,033 branch-misses # 0.90% of all branches
6.091185118 seconds time elapsed
2.419163000 seconds user
2.228566000 seconds sys
As can be seen on the task clock, the program spends 4.619s on a wall clock. Where is the 4.619 - 3.994 = 625ms(which is around 13.5% of the total time spent on the CPU) spent that is not considered by pytorch profiler?
Thanks
Edit
changed from 13.5% of the total wall clock time to 13.5% of the total time spent on the CPU