Sorry for not being clear. Here are some examples that I ran on a GTX 1080 Ti with PyTorch 1.8.
Maybe I am misunderstanding something about asynchronous execution.
```python
from timeit import default_timer as timer

import torch
from torch import nn, jit
from torch.distributions import Normal
from tqdm import tqdm


class Timer:
    """Context manager that prints the host-side wall-clock time of its block."""

    def __init__(self, name=None):
        self.name = name
        self.start = None
        self.end = None

    def __enter__(self):
        self.start = timer()

    def __exit__(self, *args, **kwargs):
        self.end = timer()
        duration = self.end - self.start
        print("DURATION:", self.name, duration)
```
Example 1: Simple multiplication
```python
with Timer("kernel complete"):
    input_cpu = torch.rand(32, 512, 512).pin_memory()
    parameter_cpu = torch.rand(32, 512, 512, requires_grad=True).pin_memory()
    with Timer("kernel launch"):
        for i in tqdm(range(512)):
            input_gpu = input_cpu.to("cuda", non_blocking=True)
            parameter_gpu = parameter_cpu.to("cuda", non_blocking=True)
            result = input_gpu * parameter_gpu
    torch.cuda.synchronize()
```

```
DURATION: kernel launch 0.040197614999669895
DURATION: kernel complete 14.410601890999715
```
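As an aside, I assume the same launch-vs-finish split could also be measured with CUDA events instead of host-side timers; a minimal sketch of what I mean (not the method used for the numbers above):

```python
# Minimal sketch: time the queued GPU work with CUDA events instead of a
# host-side timer. Assumes a CUDA device is available; not the timing method
# used for the numbers above.
import torch

x = torch.rand(32, 512, 512, device="cuda")
y = torch.rand(32, 512, 512, device="cuda")

start = torch.cuda.Event(enable_timing=True)
end = torch.cuda.Event(enable_timing=True)

start.record()                # recorded on the current CUDA stream
for _ in range(512):
    z = x * y                 # kernels are queued asynchronously
end.record()

torch.cuda.synchronize()      # wait until both events have completed
print("GPU time (ms):", start.elapsed_time(end))
```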
Example 2: Simple multiplication with backward pass
```python
with Timer("kernel complete"):
    input_cpu = torch.rand(32, 512, 512).pin_memory()
    parameter_cpu = torch.rand(32, 512, 512, requires_grad=True).pin_memory()
    with Timer("kernel launch + cpu->gpu transfer"):
        with Timer("kernel launch"):
            for i in tqdm(range(512)):
                input_gpu = input_cpu.to("cuda", non_blocking=True)
                parameter_gpu = parameter_cpu.to("cuda", non_blocking=True)
                result = input_gpu * parameter_gpu
                result.mean().backward()
        input_gpu = input_cpu.to("cuda", non_blocking=True)
        parameter_gpu = parameter_cpu.to("cuda", non_blocking=True)
    torch.cuda.synchronize()
```

```
DURATION: kernel launch 5.375270492999334
DURATION: kernel launch + cpu->gpu transfer 5.3755872629990336
DURATION: kernel complete 6.740538608999486
```
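My guess (not verified) is that the blocking in example 2 comes from `backward()` accumulating the gradient into the pinned CPU leaf, which needs a device-to-host copy. A sketch of the comparison I have in mind, with the parameter created directly on the GPU:

```python
# Sketch of a comparison run: same loop, but with a CUDA leaf parameter, to
# check whether backward() still blocks when no gradient has to be copied back
# to the CPU. Just a guess on my side, not verified.
import torch
from tqdm import tqdm

input_cpu = torch.rand(32, 512, 512).pin_memory()
parameter_gpu = torch.rand(32, 512, 512, device="cuda", requires_grad=True)

with Timer("kernel launch"):   # Timer is the context manager defined above
    for i in tqdm(range(512)):
        input_gpu = input_cpu.to("cuda", non_blocking=True)
        result = input_gpu * parameter_gpu
        result.mean().backward()   # gradient stays on the GPU
torch.cuda.synchronize()
```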
Example 3: `torch.distributions.Normal(...).sample()` seems to block:
```python
with Timer("kernel complete"):
    input_cpu = torch.rand(32, 512, 512).pin_memory()
    parameter_cpu = torch.rand(32, 512, 512, requires_grad=True).pin_memory()
    with Timer("kernel launch"):
        for i in tqdm(range(512)):
            input_gpu = input_cpu.to("cuda", non_blocking=True)
            parameter_gpu = parameter_cpu.to("cuda", non_blocking=True)
            result = Normal(input_gpu, 1).sample() * parameter_gpu
    torch.cuda.synchronize()
```

```
DURATION: kernel launch 5.955725089000225
DURATION: kernel complete 7.688505447999887
```
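One thing I am suspicious of (purely a guess): `Normal` validates its arguments by default, and I wonder whether that check reads values back to the host. A sketch of what I would try to rule it out:

```python
# Sketch: construct the distribution with validate_args=False to check whether
# argument validation is what forces the synchronization. Guess, not confirmed.
import torch
from torch.distributions import Normal

input_gpu = torch.rand(32, 512, 512, device="cuda")
parameter_gpu = torch.rand(32, 512, 512, device="cuda")

for _ in range(512):
    result = Normal(input_gpu, 1, validate_args=False).sample() * parameter_gpu
torch.cuda.synchronize()
```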
Example 4: `torch.normal()` launches quicker?
```python
with Timer("kernel complete"):
    input_cpu = torch.rand(32, 512, 512).pin_memory()
    parameter_cpu = torch.rand(32, 512, 512, requires_grad=True).pin_memory()
    with Timer("kernel launch"):
        for i in tqdm(range(512)):
            input_gpu = input_cpu.to("cuda", non_blocking=True)
            parameter_gpu = parameter_cpu.to("cuda", non_blocking=True)
            result = torch.normal(input_gpu, std=1) * parameter_gpu
    torch.cuda.synchronize()
```

```
DURATION: kernel launch 1.862492633000329
DURATION: kernel complete 7.237628412999584
```
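If it helps with tracking this down: I believe PyTorch versions newer than 1.8 have a debug mode that warns whenever an operation synchronizes with the host (so probably not usable on my setup, just mentioning it); a sketch:

```python
# Sketch: ask PyTorch to warn on every operation that synchronizes with the
# host. I believe this API exists only in releases newer than 1.8, so this is
# an assumption on my side.
import torch

torch.cuda.set_sync_debug_mode("warn")   # or "error" to raise instead

x = torch.rand(32, 512, 512, device="cuda")
result = torch.normal(x, std=1) * x      # any blocking call should now warn
```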
Example 5: `log_prob` also blocks?
```python
with Timer("kernel complete"):
    input_cpu = torch.rand(32, 512, 512).pin_memory()
    parameter_cpu = torch.rand(32, 512, 512, requires_grad=True).pin_memory()
    with Timer("kernel launch"):
        for i in tqdm(range(512)):
            input_gpu = input_cpu.to("cuda", non_blocking=True)
            parameter_gpu = parameter_cpu.to("cuda", non_blocking=True)
            result = Normal(input_gpu, 1).log_prob(parameter_gpu)
    torch.cuda.synchronize()
```

```
DURATION: kernel launch 6.612539947000187
DURATION: kernel complete 8.380056750000222
```
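Same suspicion here as with `.sample()`: `log_prob` validates the value against the distribution's support by default. If validation really is the culprit, I believe it can also be switched off globally; a sketch (not verified):

```python
# Sketch: disable argument/support validation for all distributions globally,
# to check whether log_prob still blocks without it. Guess, not verified.
import torch
from torch.distributions import Distribution, Normal

Distribution.set_default_validate_args(False)

input_gpu = torch.rand(32, 512, 512, device="cuda")
parameter_gpu = torch.rand(32, 512, 512, device="cuda")

for _ in range(512):
    result = Normal(input_gpu, 1).log_prob(parameter_gpu)
torch.cuda.synchronize()
```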