Since CUDA is initialized lazily in PyTorch, the first operation that touches the GPU incurs extra one-time overhead and reports a higher runtime.
Also, when timing CUDA operations you must synchronize manually, because the CUDA API is asynchronous: kernel launches return immediately, before the work has finished.
import torch
import time


def _timed_dot(vec):
    """Run torch.dot(vec, vec) on the GPU and return the elapsed seconds.

    Synchronizes before starting the clock (so previously queued work is not
    counted) and after the call (so the asynchronous kernel has actually
    finished before we stop the clock). Uses time.perf_counter(), the
    monotonic high-resolution clock meant for interval timing — time.time()
    can jump when the wall clock is adjusted.
    """
    torch.cuda.synchronize()  # drain any pending GPU work first
    start = time.perf_counter()
    torch.dot(vec, vec)
    torch.cuda.synchronize()  # wait for the async dot kernel to complete
    return time.perf_counter() - start


m = 998
n = 2473
a = torch.randn(m * n).cuda()

# First call pays the lazy CUDA initialization / warm-up cost.
print("initial run")
print(_timed_dot(a))

# Subsequent calls show the steady-state kernel time.
print("normal run")
print(_timed_dot(a))
print("normal run")
print(_timed_dot(a))