I am trying mixed precision for a tensor operation, and I find the runtime is not improved — it is actually worse. Is there anything I missed?
torch.cuda.get_device_name()
'Tesla T4'
import torch
import time

# Benchmark: elementwise multiply of a large half-precision tensor.
a = torch.zeros(60000, 200, dtype=torch.float16, device='cuda')
# FIX: b was float32. fp16 * fp32 type-promotes the product to fp32,
# so the original "mixed precision" benchmark never actually performed
# the multiply in half precision. Keep b in fp16 so the op stays fp16.
b = torch.tensor([0.01, 0.03, 0.05, 0.07], dtype=torch.float16, device='cuda')

for i in range(5):
    # FIX: CUDA kernels launch asynchronously. Without synchronizing
    # before starting the clock and before reading it, time.time()
    # measures kernel-launch/queueing overhead, not GPU execution time
    # — which is why fp16 and fp32 looked identical.
    torch.cuda.synchronize()
    start = time.time()
    for j in range(1000):
        a.normal_()
        # b.view(-1, 1, 1) has shape (4, 1, 1); broadcasting against the
        # (60000, 200) tensor produces a (4, 60000, 200) result.
        d = a * b.view(-1, 1, 1)
    torch.cuda.synchronize()
    print(time.time() - start)
0.9754595756530762
1.9229979515075684
1.913642168045044
1.9154479503631592
1.9257619380950928
import torch
import time

# Baseline benchmark: the same elementwise multiply in full precision.
a = torch.zeros(60000, 200, dtype=torch.float32, device='cuda')
b = torch.tensor([0.01, 0.03, 0.05, 0.07], dtype=torch.float32, device='cuda')

for i in range(5):
    # FIX: CUDA kernels launch asynchronously. Synchronize before
    # starting the clock and before reading it, otherwise time.time()
    # measures launch overhead rather than GPU execution time.
    torch.cuda.synchronize()
    start = time.time()
    for j in range(1000):
        a.normal_()
        # b.view(-1, 1, 1) has shape (4, 1, 1); broadcasting against the
        # (60000, 200) tensor produces a (4, 60000, 200) result.
        d = a * b.view(-1, 1, 1)
    torch.cuda.synchronize()
    print(time.time() - start)
0.910468339920044
1.8464961051940918
1.8373432159423828
1.841585397720337
1.8456008434295654