Hi,
I am trying to measure the effective GPU memory bandwidth achieved during Hugging Face Llama model inference. Specifically, I want to measure it for this line of code:
**outputs = model.generate(input_ids, generation_config=generation_config)**
I am using these two functions to measure bandwidth but the memory bandwidth I am getting is very low.
Could you please guide me on what might be wrong in my calculation? Are the max_memory and the bandwidth calculated correctly?
Thanks
def inference():
    """Run one generation pass with autograd disabled and return its output."""
    with torch.no_grad():
        return model.generate(input_ids, generation_config=generation_config)
def measure_bandwidth(func, bytes_transferred=None):
    """Time a GPU workload and estimate its effective memory bandwidth.

    Args:
        func: zero-argument callable that launches the GPU work.
        bytes_transferred: total bytes actually read + written by ``func``,
            if known. For autoregressive ``generate()`` a reasonable
            estimate is ``model_size_in_bytes * num_generated_tokens``,
            because the weights are typically re-read for every generated
            token. If None, falls back to the original peak-allocation
            estimate (see warning below).

    Returns:
        (result, bandwidth) where bandwidth is in GiB/s.
    """
    # Drain any previously queued work first so (a) the timing window only
    # covers `func` and (b) stale allocations can't pollute the peak stats.
    torch.cuda.synchronize()
    torch.cuda.reset_peak_memory_stats()

    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    start.record()
    result = func()
    end.record()
    # Wait for `end` to actually be recorded before querying elapsed time.
    torch.cuda.synchronize()
    elapsed_time = start.elapsed_time(end) / 1000  # ms -> seconds

    gib = 1024 * 1024 * 1024
    if bytes_transferred is not None:
        # True effective bandwidth: data actually moved divided by time.
        bandwidth = (bytes_transferred / gib) / elapsed_time  # GiB/s
    else:
        # WARNING: peak *allocated* memory is the memory footprint, not the
        # traffic. generate() re-reads the model weights for every generated
        # token, so real traffic is roughly footprint * num_tokens — this
        # fallback therefore underestimates bandwidth by a large factor,
        # which is why the reported number looks "very low".
        max_memory = torch.cuda.max_memory_allocated() / gib
        bandwidth = max_memory / elapsed_time  # GiB/s
    return result, bandwidth
# NOTE(review): as written, `bandwidth` is peak-allocated-GiB / wall-time — a
# memory-footprint rate, not the bandwidth the hardware actually achieved.
# For autoregressive generation the weights are presumably re-read once per
# generated token, so the true traffic is far larger than the peak
# allocation; that mismatch is why this printed value comes out very low.
outputs, bandwidth = measure_bandwidth(inference)
print(f"Estimated GPU memory bandwidth usage: {bandwidth:.2f} GB/s")