I’d be grateful for help understanding the following result, which appears to show GPU 0 taking vastly longer than the other GPUs to perform a simple matrix multiplication. I synchronize and shuffle the GPU order on every run, yet the effect is large, persistent, reproducible on multiple machines, and grows with the matrix size.
Here is the output, followed by the complete code:
OUTPUT:
/root/.virtualenvs/a1/bin/python /workspace/a1/src/debug.py
torch version: 2.5.1+cu124
torch CUDA version: 12.4
torch cuDNN version: 90100
size: 1000
GPU 7: 80.3 ms
GPU 6: 10.1 ms
GPU 3: 10.1 ms
GPU 5: 10.1 ms
GPU 1: 10.1 ms
GPU 4: 10.1 ms
GPU 0: 9.8 ms
GPU 2: 10.2 ms
size: 10000
GPU 2: 0.3 ms
GPU 5: 0.3 ms
GPU 3: 0.3 ms
GPU 0: 25.7 ms
GPU 4: 0.3 ms
GPU 6: 0.3 ms
GPU 1: 0.3 ms
GPU 7: 0.3 ms
size: 20000
GPU 6: 0.4 ms
GPU 0: 190.6 ms
GPU 2: 0.4 ms
GPU 7: 0.4 ms
GPU 4: 0.5 ms
GPU 3: 0.4 ms
GPU 5: 0.4 ms
GPU 1: 0.5 ms
size: 40000
GPU 3: 2.9 ms
GPU 1: 0.7 ms
GPU 4: 0.7 ms
GPU 7: 0.8 ms
GPU 0: 1536.5 ms
GPU 6: 0.8 ms
GPU 5: 0.7 ms
GPU 2: 0.7 ms
Process finished with exit code 0
CODE:
import random
import torch
def gpu_test(logical_gpus, size):
    print(f"\nsize: {size}")
    random.shuffle(logical_gpus)  # Shuffle in case order affects results
    for gpu in logical_gpus:
        # Create a clean CUDA context
        torch.cuda.empty_cache()
        torch.cuda.device(f"cuda:{gpu}")
        start = torch.cuda.Event(enable_timing=True)
        end = torch.cuda.Event(enable_timing=True)
        x = torch.randn(size, size, device=f"cuda:{gpu}")
        # Time matmul() in isolation
        torch.cuda.synchronize()
        start.record()
        torch.matmul(x, x.t())
        torch.cuda.synchronize()
        end.record()
        torch.cuda.synchronize()
        print(f"GPU {gpu}: {start.elapsed_time(end):.1f} ms")

def experiments():
    gpus = list(range(torch.cuda.device_count()))
    for size in [1000, 10000, 20000, 40000]:
        gpu_test(gpus, size)

if __name__ == "__main__":
    print(f"torch version: {torch.version.__version__}")
    print(f"torch CUDA version: {torch.version.cuda}")
    print(f"torch cuDNN version: {torch.backends.cudnn.version()}")
    print()
    experiments()
Note that nvtop shows no other processes running and exactly the same memory usage on each of the GPUs prior to execution.
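In case it helps, here is a simpler wall-clock variant of the per-GPU timing that I could use as a cross-check. This is only a sketch (the function name wall_clock_test is mine, not part of the run above); it uses time.perf_counter() together with a per-device torch.cuda.synchronize(gpu) instead of CUDA events:

import time
import torch

def wall_clock_test(gpu, size):
    # Allocate the operand directly on the target GPU
    x = torch.randn(size, size, device=f"cuda:{gpu}")
    torch.cuda.synchronize(gpu)  # wait for randn to finish on that device
    t0 = time.perf_counter()
    torch.matmul(x, x.t())
    torch.cuda.synchronize(gpu)  # wait for the matmul kernel on that device
    t1 = time.perf_counter()
    print(f"GPU {gpu}: {(t1 - t0) * 1000:.1f} ms")

If this variant shows the same pattern, the slowdown is presumably real; if it doesn't, the discrepancy is somewhere in my event-based timing.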
Thanks in advance for any guidance!