Hi folks, is there a way to set deterministic=True without getting slow training when using mixed precision? Is it a bug?
Here are my benchmarks using RTX 3070 (pytorch 1.10 - cuda 11.3):
deterministic=False 43.032ms per iter vs
deterministic=True 354.005ms per iter
import torch
import torch.nn as nn
from torch.cuda.amp import autocast, GradScaler
import time
from torchvision.models import mobilenet_v2
class model(nn.Module):
    """MobileNetV2 backbone followed by a small MLP head (1000 -> 10)."""

    def __init__(self):
        super().__init__()
        # Randomly initialized MobileNetV2 backbone producing 1000 logits.
        self.mobilenet = mobilenet_v2()
        # NOTE(review): a ReLU after the final Linear is unusual for
        # classification logits, but it is kept to preserve behavior.
        self.linear = nn.Sequential(
            nn.Linear(1000, 250),
            nn.Linear(250, 64),
            nn.Linear(64, 32),
            nn.Linear(32, 10),
            nn.ReLU(),
        )

    def forward(self, x):
        """Run the backbone, then map its output through the MLP head."""
        features = self.mobilenet(x)
        return self.linear(features)
# Fixed synthetic batch on GPU 0: 32 RGB images at 224x224 and integer
# class targets in [0, 10).
x = torch.randn(32, 3, 224, 224).cuda(0)
y = torch.randint(0, 10, (32,)).cuda(0)
# Model, loss, optimizer and AMP gradient scaler shared by the benchmark.
m = model().cuda(0)
criterion = nn.CrossEntropyLoss()
optim = torch.optim.Adam(m.parameters(), 1e-4)
scaler = GradScaler()
# Default number of timed training iterations.
nb_iters = 100
def train16(nb_iters=100, deterministic=False):
    """Benchmark mixed-precision (AMP) training and print ms per iteration.

    Uses the module-level model ``m``, inputs ``x``/``y``, ``criterion``,
    ``optim`` and ``scaler``.

    Args:
        nb_iters: number of timed training iterations.
        deterministic: if True, restrict cuDNN to deterministic
            algorithms — expected to be slower, since the faster
            non-deterministic kernels are excluded from selection.
    """
    torch.backends.cudnn.deterministic = deterministic
    # Drain any pending GPU work so it doesn't leak into the timed window.
    torch.cuda.synchronize()
    # perf_counter is monotonic and higher-resolution than time.time().
    t0 = time.perf_counter()
    for _ in range(nb_iters):
        optim.zero_grad()
        with autocast(enabled=True):
            output = m(x)
            loss = criterion(output, y)
        # Scale the loss to avoid fp16 gradient underflow, then unscale
        # inside scaler.step before the optimizer update.
        scaler.scale(loss).backward()
        scaler.step(optim)
        scaler.update()
        # (Removed a no-op `loss.detach()` whose result was discarded.)
    # Wait for all queued kernels to finish before stopping the clock.
    torch.cuda.synchronize()
    t1 = time.perf_counter()
    print('{:.3f}ms per iter'.format((t1 - t0) / nb_iters * 1000.))
# Time the deterministic configuration first, then the default one.
train16(nb_iters=nb_iters, deterministic=True)
train16(nb_iters=nb_iters)
Thanks in advance!