Mixed precision training is very slow when deterministic=True

Hi folks, is there a way to set deterministic=True without slowing down mixed precision training? Or is this a bug?

Here are my benchmarks on an RTX 3070 (PyTorch 1.10, CUDA 11.3):
deterministic=False: 43.032ms per iter
deterministic=True: 354.005ms per iter

import torch
import torch.nn as nn
from torch.cuda.amp import autocast, GradScaler
import time
from torchvision.models import mobilenet_v2

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        # MobileNetV2 backbone followed by a small fully connected head
        self.mobilenet = mobilenet_v2()
        self.linear = nn.Sequential(nn.Linear(1000, 250),
                                    nn.Linear(250, 64),
                                    nn.Linear(64, 32),
                                    nn.Linear(32, 10),
                                    nn.ReLU())

    def forward(self, x):
        out = self.mobilenet(x)
        out = self.linear(out)
        return out

# Dummy batch, model, loss, optimizer, and gradient scaler for AMP
x = torch.randn(32, 3, 224, 224).cuda(0)
y = torch.randint(0, 10, (32,)).cuda(0)
m = Model().cuda(0)
criterion = nn.CrossEntropyLoss()
optim = torch.optim.Adam(m.parameters(), 1e-4)
scaler = GradScaler()

nb_iters = 100

def train16(nb_iters=100, deterministic=False):
    torch.backends.cudnn.deterministic = deterministic
    torch.cuda.synchronize()  # flush pending GPU work before starting the clock
    t0 = time.time()
    for _ in range(nb_iters):
        optim.zero_grad()
        with autocast(enabled=True):
            output = m(x)
            loss = criterion(output, y)
        scaler.scale(loss).backward()
        scaler.step(optim)
        scaler.update()
    torch.cuda.synchronize()  # wait for all kernels to finish before stopping the clock
    t1 = time.time()
    print('{:.3f}ms per iter'.format((t1 - t0) / nb_iters * 1000.))

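# Optional warm-up (a suggested sketch): run a few untimed iterations so
# one-off CUDA startup costs (allocator growth, algorithm selection) don't
# inflate whichever configuration happens to run first.
for _ in range(10):
    optim.zero_grad()
    with autocast(enabled=True):
        warmup_loss = criterion(m(x), y)
    scaler.scale(warmup_loss).backward()
    scaler.step(optim)
    scaler.update()
torch.cuda.synchronize()
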
train16(nb_iters, deterministic=True)
train16(nb_iters)

Thanks in advance!

No, this is expected rather than a bug: deterministic algorithms can be slower by design, since the backend has to select kernel implementations that guarantee reproducible results instead of the fastest available ones.
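
If you need the reproducibility despite the cost, here is a minimal sketch of the usual switches (assuming PyTorch >= 1.8 and CUDA >= 10.2; the environment variable must be set before the first cuBLAS call):

import os
import torch

# cuBLAS needs a fixed workspace size for deterministic GEMMs on CUDA >= 10.2
os.environ['CUBLAS_WORKSPACE_CONFIG'] = ':4096:8'

torch.backends.cudnn.deterministic = True  # restrict cuDNN to deterministic kernels
torch.backends.cudnn.benchmark = False     # the autotuner may otherwise pick non-deterministic algorithms
torch.use_deterministic_algorithms(True)   # raise an error on ops without a deterministic implementation

Note that torch.use_deterministic_algorithms(True) goes further than the cuDNN flag alone: it raises a RuntimeError for any op that has no deterministic implementation, instead of silently running a non-deterministic one.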