I don’t know what I’m doing wrong, but my FP16 and BF16 benchmarks are way slower than the FP32 and TF32 modes. Here are my results with the 2 GPUs at my disposal (RTX 2060 Mobile, RTX 3090 Desktop):
Benching precision speed on a NVIDIA GeForce RTX 2060
benching FP32...
epoch 0 took 13.9146514s
epoch 1 took 11.6350846s
epoch 2 took 11.867831299999999s
benching FP16...
epoch 0 took 15.745933399999998s
epoch 1 took 16.212830699999998s
epoch 2 took 16.495791399999987s
Benching precision speed on a NVIDIA GeForce RTX 3090
benching FP32...
epoch 0 took 5.7641565s
epoch 1 took 4.0729165s
epoch 2 took 4.0790243s
benching TF32...
epoch 0 took 4.042242200000002s
epoch 1 took 4.0321663s
epoch 2 took 4.080792600000002s
benching FP16...
epoch 0 took 5.053079000000004s
epoch 1 took 5.029029299999998s
epoch 2 took 4.973819899999995s
benching BF16...
epoch 0 took 11.721234800000005s
epoch 1 took 11.542296499999999s
epoch 2 took 11.566654600000007s
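For what it’s worth, here’s a minimal matmul-only timing sketch (my own addition, not part of the benchmark above; the matrix size and iteration count are arbitrary) that I can use to check whether the tensor cores themselves are faster in FP16/BF16, independent of data loading and the small model:

import timeit
import torch

device = torch.device("cuda")

def time_matmul(dtype, n=4096, iters=50):
    a = torch.randn(n, n, device=device, dtype=dtype)
    b = torch.randn(n, n, device=device, dtype=dtype)
    a @ b  # warmup so kernel selection isn't included in the timing
    torch.cuda.synchronize()
    start = timeit.default_timer()
    for _ in range(iters):
        a @ b
    torch.cuda.synchronize()  # wait for all queued matmuls before stopping the clock
    return timeit.default_timer() - start

for dtype in (torch.float32, torch.float16, torch.bfloat16):
    if dtype == torch.bfloat16 and not torch.cuda.is_bf16_supported():
        continue  # BF16 matmul needs Ampere or newer
    print(dtype, time_matmul(dtype), "s")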
And here’s the file I used to generate the benchmark, a simple MNIST classifier that you can easily run on your own computer (I run it with PyTorch 1.12 on Windows and the latest NVIDIA drivers):
# MNIST example inspired by https://github.com/pytorch/examples/blob/main/mnist/main.py
import timeit
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision

dataset = torchvision.datasets.MNIST(root='data', train=True, transform=torchvision.transforms.ToTensor(), download=True)
loader = torch.utils.data.DataLoader(dataset, batch_size=64, pin_memory=True, shuffle=True)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output

device = torch.device("cuda")

def bench(mode, epochs=3):
    print("benching " + mode + "...")
    # TF32 only affects FP32 matmuls, so it is enabled for that mode alone
    torch.backends.cuda.matmul.allow_tf32 = (mode == 'TF32')
    # Gradient scaling is only needed for FP16 (BF16 has the same exponent range as FP32)
    scaler = torch.cuda.amp.GradScaler(enabled=(mode == 'FP16'))
    model = Net().to(device)
    optimizer = torch.optim.Adadelta(model.parameters(), lr=1.0)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1, gamma=0.7)
    model.train()
    for epoch in range(epochs):
        start_time = timeit.default_timer()
        for batch_idx, (data, target) in enumerate(loader):
            data, target = data.to(device), target.to(device)
            optimizer.zero_grad()
            # autocast runs the forward pass in FP16/BF16; it is a no-op for FP32/TF32
            with torch.autocast(device_type='cuda', dtype=torch.bfloat16 if mode == 'BF16' else torch.float16, enabled=('16' in mode)):
                output = model(data)
                loss = F.nll_loss(output, target)
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        scheduler.step()
        torch.cuda.synchronize()  # make sure all GPU work is done before reading the clock
        end_time = timeit.default_timer()
        print("epoch " + str(epoch) + " took " + str(end_time - start_time) + "s")
    print("")

print("Benching precision speed on a " + torch.cuda.get_device_name(0))
print("")
bench(mode="FP32")
# BF16 support doubles as an Ampere-or-newer check, which is also what TF32 needs
if torch.cuda.is_bf16_supported():
    bench(mode="TF32")
bench(mode="FP16")
if torch.cuda.is_bf16_supported():
    bench(mode="BF16")
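As a sanity check, here’s a minimal sketch (my own addition, reusing Net and device from the script above) to confirm autocast is actually engaging. Conv outputs should come back as torch.float16 inside the region; note that the final log_softmax output would still report float32, since autocast keeps softmax-style ops in FP32, which is why I check a conv output instead:

model = Net().to(device)
x = torch.randn(64, 1, 28, 28, device=device)  # same shape as one MNIST batch
with torch.autocast(device_type='cuda', dtype=torch.float16):
    y = model.conv1(x)
print(y.dtype)  # expect torch.float16 when autocast is active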