Apparently the later examples work and are optimized well; I'm not sure what I did wrong in my example:
import time

import torch
# Returns the result of running `fn()` and the time it took for `fn()` to run,
# in seconds. We use CUDA events and synchronization for the most accurate
# measurements.
def timed(fn):
    """Run ``fn()`` once and measure how long it takes.

    When CUDA is available, the call is bracketed by CUDA events and followed
    by a device-wide synchronize, so asynchronously launched GPU work is
    included in the measurement. Without CUDA, the call is timed on the host
    with ``time.perf_counter`` instead.

    Args:
        fn: Zero-argument callable to execute.

    Returns:
        A ``(result, seconds)`` tuple: the value returned by ``fn()`` and the
        elapsed time in seconds.
    """
    if not torch.cuda.is_available():
        # No GPU: CUDA events cannot be used, so fall back to a host timer.
        t0 = time.perf_counter()
        result = fn()
        return result, time.perf_counter() - t0

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    result = fn()
    end.record()
    # Wait for all queued GPU work so both events have actually completed.
    torch.cuda.synchronize()
    # elapsed_time() reports milliseconds; convert to seconds.
    return result, start.elapsed_time(end) / 1000
# Generates random input and target data for the model, where `b` is the
# batch size.
def generate_data(b, device=None):
    """Generate a random batch of inputs and targets for the model.

    Args:
        b: Batch size.
        device: Device to place the tensors on. Defaults to CUDA when it is
            available, otherwise the CPU.

    Returns:
        A tuple ``(inputs, targets)``: ``inputs`` is a float32 tensor of shape
        ``(b, 3, 128, 128)`` drawn from a standard normal distribution, and
        ``targets`` is an integer tensor of shape ``(b,)`` with values in
        ``[0, 1000)``.
    """
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return (
        torch.randn(b, 3, 128, 128).to(device=device, dtype=torch.float32),
        torch.randint(1000, (b,)).to(device),
    )
# Number of timed iterations in each benchmark loop (eager and compiled).
N_ITERS = 10
from torchvision.models import densenet121
def init_model():
    """Build the DenseNet-121 model used by the benchmark.

    The model is constructed with ``densenet121``'s default arguments, cast to
    float32, and moved to CUDA when it is available (otherwise it stays on the
    CPU).

    Returns:
        The ``densenet121`` module placed on the selected device.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    return densenet121().to(device=device, dtype=torch.float32)
model = init_model()

# Reset since we are using a different mode.
import torch._dynamo

torch._dynamo.reset()

model_opt = torch.compile(model, mode="reduce-overhead")

# NOTE(review): the model is never switched with .eval(); torch.no_grad()
# only disables autograd. Confirm this is intended for an "eval" benchmark.

# Time the eager model; every iteration gets a fresh random batch of 16.
eager_times = []
for i in range(N_ITERS):
    inp = generate_data(16)[0]
    with torch.no_grad():
        _, eager_time = timed(lambda: model(inp))
    eager_times.append(eager_time)
    print(f"eager eval time {i}: {eager_time}")

print("~" * 10)

# Time the compiled model the same way. The first iterations include
# compilation/warm-up, so they are expected to be much slower than the rest.
compile_times = []
for i in range(N_ITERS):
    inp = generate_data(16)[0]
    with torch.no_grad():
        _, compile_time = timed(lambda: model_opt(inp))
    compile_times.append(compile_time)
    print(f"compile eval time {i}: {compile_time}")

print("~" * 10)

import numpy as np

# Compare medians so the slow warm-up iterations do not skew the result.
eager_med = np.median(eager_times)
compile_med = np.median(compile_times)
speedup = eager_med / compile_med
assert speedup > 1
print(f"(eval) eager median: {eager_med}, compile median: {compile_med}, speedup: {speedup}x")
print("~" * 10)
python3 ex-3-original-2.py
eager eval time 0: 1.8496431884765625
eager eval time 1: 0.02884818458557129
eager eval time 2: 0.026322099685668947
eager eval time 3: 0.026348180770874023
eager eval time 4: 0.02663505935668945
eager eval time 5: 0.02669969940185547
eager eval time 6: 0.026537784576416014
eager eval time 7: 0.033738910675048826
eager eval time 8: 0.027683704376220704
eager eval time 9: 0.026418901443481444
/usr/local/lib/python3.10/dist-packages/torch/_inductor/compile_fx.py:135: UserWarning: TensorFloat32 tensor cores for float32 matrix multiplication available but not enabled. Consider setting `torch.set_float32_matmul_precision('high')` for better performance.
warnings.warn(
compile eval time 0: 78.0921796875
compile eval time 1: 0.785571533203125
compile eval time 2: 0.0111019287109375
compile eval time 3: 0.010863529205322266
compile eval time 4: 0.010895689010620118
compile eval time 5: 0.01093185043334961
compile eval time 6: 0.011064648628234863
compile eval time 7: 0.01088080883026123
compile eval time 8: 0.011026410102844239
compile eval time 9: 0.010890089035034179
(eval) eager median: 0.02666737937927246, compile median: 0.010979130268096923, speedup: 2.4289154721811013x