I’m having the same issue on the A6000:
import torch
from torch.cuda.amp import autocast
from transformers import AutoModel
# Load the pretrained BERT base model, move it to device index 1, and put it
# in training mode for the forward passes timed below.
model = AutoModel.from_pretrained('bert-base-uncased').to(1)
model.train()
# Random integer ids in [0, 1000) with shape (8, 128), moved to the same device.
# NOTE(review): presumably (batch, sequence length) token ids -- confirm.
x=torch.randint(0,1000, (8,128)).to(1)
def autocast_forward(model, x, fast_dtype=None):
    """Run ``model.forward(x)``, optionally inside a CUDA autocast region.

    Args:
        model: Object exposing a ``forward`` method.
        x: Input passed unchanged to ``model.forward``.
        fast_dtype: Low-precision dtype to autocast to (e.g.
            ``torch.float16`` or ``torch.bfloat16``). ``None`` (the default)
            skips autocast entirely.

    Returns:
        Whatever ``model.forward(x)`` returns.
    """
    if fast_dtype is None:
        return model.forward(x)
    # The released autocast API names this keyword ``dtype``; passing
    # ``fast_dtype=`` raises TypeError there.
    with autocast(dtype=fast_dtype):
        return model.forward(x)
import torch.utils.benchmark as benchmark
# One timed statement per configuration: no autocast, fp16 autocast, bf16 autocast.
stmts = (
    'autocast_forward(model, x)',
    'autocast_forward(model, x, fast_dtype=torch.float16)',
    'autocast_forward(model, x, fast_dtype=torch.bfloat16)',
)
# Each timer gets its own globals dict exposing the shared model and input.
t0, t1, t2 = [
    benchmark.Timer(
        stmt=stmt,
        setup='from __main__ import autocast_forward',
        globals={'x': x, 'model': model},
    )
    for stmt in stmts
]
# Timings reported by the author, in print order:
# 8.91 ms (no autocast), 9.18 ms (float16), 9.13 ms (bfloat16).
for timer in (t0, t1, t2):
    print(timer.timeit(1000))
My goal is to use bf16 rather than fp16, but the issue occurs with both: neither autocast run is faster than the run without autocast.
@ptrblck, is there any new insight on this?