I am trying to profile the advantage of using torch.compile versus the default eager mode for inference, using the PyTorch profiler to measure CUDA total time and CPU total time.
My expectation was that, because of the compile-once, launch-many-times advantage of torch.compile, I should see an overall reduction in CPU total time with the compiled model, and that the cudaLaunchKernel time could also potentially be lower in compiled (graph) mode because of more opportunities for fusion.
The logs don't show this. Here's the code I am trying -
import torch
import torch._dynamo
from torch.profiler import profile, record_function, ProfilerActivity
from transformers import (
    GPT2LMHeadModel,
    pipeline,
    AutoTokenizer,
    AutoModelForCausalLM,
    LlamaForCausalLM,
)
gen_model = 'meta-llama/Llama-3.2-1B'
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=gen_model)
tokenizer.pad_token = tokenizer.eos_token
# Generates random input data for the model, where `b` is the batch size.
def generate_data(b, max_length=512):
    # Generate b random strings (or you can provide your own text data)
    generator = pipeline('text-generation', model=gen_model, device='cuda')
    sentences_generated = [item['generated_text']
                           for item in generator("Generate something about",
                                                 max_length=max_length,
                                                 num_return_sequences=b)]
    encodings = tokenizer(sentences_generated, return_tensors='pt',
                          padding=True, truncation=True, max_length=max_length)
    # Move the tokenized input data to the GPU
    input_ids = encodings['input_ids'].to(torch.int64).cuda()            # token ids
    attention_mask = encodings['attention_mask'].to(torch.int64).cuda()  # attention mask
    # No labels required in this case, but you could return labels if needed
    return input_ids, attention_mask
def init_model():
    model = AutoModelForCausalLM.from_pretrained(gen_model)
    # Move the model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)
    return model
# Usage
model = init_model()
# Reset since we are using a different mode.
torch._dynamo.reset()
# Compile the model
model_opt = torch.compile(model, mode="reduce-overhead")
inp_data = generate_data(10)
# Profile the eager model
with torch.no_grad():
    with profile(activities=[ProfilerActivity.CUDA], profile_memory=True) as prof:
        with record_function("model_inference"):
            model(inp_data[0], inp_data[1])
print(prof.key_averages().table())

# Profile the compiled model (first call, includes compilation)
with torch.no_grad():
    with profile(activities=[ProfilerActivity.CUDA], profile_memory=True) as prof:
        with record_function("model_inference"):
            model_opt(inp_data[0], inp_data[1])
print(prof.key_averages().table())

# Run the compiled model multiple times and profile each run
with torch.no_grad():
    for run_idx in range(4):
        print(f"--- Profiling Run {run_idx + 1} ---")
        # Start profiling
        with profile(activities=[ProfilerActivity.CUDA, ProfilerActivity.CPU], profile_memory=True) as prof:
            with record_function("model_inference"):
                model_opt(inp_data[0], inp_data[1])
        # Print profiling data
        print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))
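In case it matters: since the first compiled call includes compilation itself, I also considered warming up the compiled model and synchronizing the GPU before profiling a steady-state call. This is just a sketch of what I have in mind (the warm-up count is arbitrary), not something I measured above:

with torch.no_grad():
    # Warm-up: run the compiled model a few times so compilation/autotuning
    # happens outside the profiled region (iteration count is arbitrary).
    for _ in range(3):
        model_opt(inp_data[0], inp_data[1])
    torch.cuda.synchronize()
    # Profile a single steady-state call.
    with profile(activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
                 profile_memory=True) as prof:
        with record_function("model_inference"):
            model_opt(inp_data[0], inp_data[1])
    torch.cuda.synchronize()
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))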
The output after eager execution is -
Self CPU time total: 1.261s
Self CUDA time total: 2.423s
Whereas, here are the times after the compiled execution -
Run 1
Self CPU time total: 1.623s
Self CUDA time total: 3.955s
Runs 2, 3, 4, and 5 are in a similar ballpark:
Self CPU time total: 1.129s
Self CUDA time total: 2.257s
Am I missing something here, or is the expectation of lower CPU time and CUDA time simply wrong?
Second - when using torch.compile vs eager mode, is there a way to evaluate whether compiled mode led to any operator fusion (and subsequent optimization) underneath?
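For what it's worth, the only ways I have found so far to peek at this are dumping the code Inductor generates and asking Dynamo for a summary of the captured graphs, roughly as sketched below. I am not sure this is the intended approach, and the exact explain() signature and log settings may differ across PyTorch versions, so treat it as an assumption on my part:

# 1) Dump the Triton/C++ code that Inductor generates (set the environment
#    variable before launching the script; the script name is just a placeholder):
#        TORCH_LOGS="output_code" python my_profiling_script.py
# 2) Ask Dynamo how many graphs it captured, which ops they contain, and
#    whether there were graph breaks.
import torch._dynamo as dynamo

explanation = dynamo.explain(model)(inp_data[0], inp_data[1])
print(explanation)  # graph count, break reasons, ops per graph, etc.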