I converted my training loop to use AMP, and I noticed that my accuracy numbers are now all 0. What needs to change for this to work with AMP? z is calculated inside a with autocast(): block, so I thought it would be fine, but apparently it is not.
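For reference, scaler is the standard native-AMP gradient scaler, created once before the loop. The relevant setup is roughly this (a minimal sketch, assuming torch.cuda.amp; model, optim, criterion, and param are defined elsewhere):

from torch.cuda.amp import autocast, GradScaler

scaler = GradScaler()  # used by scaler.scale(loss).backward() / scaler.step(optim) below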
print("Training..............................\r", end='')
train_iter = iter(train_loader)
next_batch = train_iter.next()
next_batch = [_.cuda(non_blocking=True) for _ in next_batch ]
for idx in range(len(train_loader)):
    image, meta, y = next_batch
    if param['cache_on']:
        if epoch == 0:
            print(f"Loading Cache using train_loader {idx * train_loader.batch_size + image.shape[0]}\r", end='')
    if idx + 1 != len(train_loader):
        # start copying data of next batch
        next_batch = next(train_iter)
        next_batch = [_.cuda(non_blocking=True) for _ in next_batch]
    optim.zero_grad()
    # AMP: forward pass and loss computed under autocast
    with autocast():
        z = model((image, meta))
        y_smo = y.float() * (1 - param['label_smoothing']) + 0.5 * param['label_smoothing']
        loss = criterion(z, y_smo.unsqueeze(1))
    # Before AMP
    # loss.backward()
    # AMP: scale the loss before the backward pass
    scaler.scale(loss).backward()
    # Before AMP
    # optim.step()
    # AMP: step through the scaler, then update its scale factor
    scaler.step(optim)
    scaler.update()
    pred = torch.sigmoid(torch.round(z))  # round off sigmoid to obtain predictions
    correct += (pred.cpu() == y.cpu().unsqueeze(1)).sum().item()  # tracking number of correctly predicted samples
    epoch_loss += loss.item()
train_acc = correct / len(train_idx)