torch.cuda.amp and Accuracy

I converted my training loop to use AMP, and now my accuracy numbers are all 0. What needs to change for this to work with AMP? `z` is calculated inside a `with autocast():` block, so I thought it would be fine, but apparently it is not.
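
For reference, this is the general `torch.cuda.amp` recipe I followed when converting the loop. A minimal, self-contained sketch; the tiny linear model, random batch, and hyperparameters here are placeholders, not my real setup:

    import torch
    from torch import nn
    from torch.cuda.amp import autocast, GradScaler

    # Placeholder model/optimizer/data, just to make the sketch runnable
    model = nn.Linear(10, 1).cuda()
    optim = torch.optim.SGD(model.parameters(), lr=1e-2)
    criterion = nn.BCEWithLogitsLoss()
    scaler = GradScaler()
    image = torch.randn(8, 10, device='cuda')
    y = torch.randint(0, 2, (8, 1), device='cuda').float()

    optim.zero_grad()
    with autocast():                   # forward pass and loss run in mixed precision
        z = model(image)               # z comes out float16 inside this block
        loss = criterion(z, y)
    scaler.scale(loss).backward()      # scale the loss so fp16 gradients don't underflow
    scaler.step(optim)                 # unscales grads; skips the step on inf/NaN
    scaler.update()                    # adjusts the loss scale for the next iteration

My actual loop is below: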

        print("Training..............................\r", end='') 
        train_iter = iter(train_loader)
        # DataLoader iterators implement __next__, so use next(); .next() is Python 2 only
        next_batch = next(train_iter)
        next_batch = [t.cuda(non_blocking=True) for t in next_batch]
        for idx in range(len(train_loader)):
            image, meta, y = next_batch
            if param['cache_on']:
                if epoch == 0:
                    print(f"Loading Cache using train_loader {idx*train_loader.batch_size + image.shape[0]}\r", end='')

            if idx + 1 != len(train_loader):
                # start the asynchronous copy of the next batch while this one is processed
                next_batch = next(train_iter)
                next_batch = [t.cuda(non_blocking=True) for t in next_batch]


            optim.zero_grad()
            # AMP: run the forward pass and loss in mixed precision
            with autocast():
                z = model((image, meta))
                # label smoothing: pull the hard 0/1 targets toward 0.5
                y_smo = y.float() * (1 - param['label_smoothing']) + 0.5 * param['label_smoothing']
                loss = criterion(z, y_smo.unsqueeze(1))

            # Before AMP:
            # loss.backward()
            # AMP: scale the loss so small fp16 gradients don't flush to zero
            scaler.scale(loss).backward()

            # Before AMP:
            # optim.step()
            # AMP: unscale gradients, skip the step on infs/NaNs, then update the scale factor
            scaler.step(optim)
            scaler.update()
            
            # sigmoid first, then round: sigmoid(round(z)) yields values like 0.27/0.5/0.73
            # that never exactly equal the 0/1 labels, which is what zeroed the accuracy
            pred = torch.round(torch.sigmoid(z))  # round off sigmoid to obtain predictions
            correct += (pred.cpu() == y.cpu().unsqueeze(1)).sum().item()  # tracking number of correctly predicted samples
            epoch_loss += loss.item()
        train_acc = correct / len(train_idx)
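
For anyone who finds this later: as far as I can tell, the zero accuracy had nothing to do with AMP. The culprit was the prediction line, which originally read `pred = torch.sigmoid(torch.round(z))`. That takes the sigmoid of *rounded logits*, so the "predictions" are values like 0.269, 0.5, or 0.731 that never exactly equal the 0/1 labels, and `correct` stays at 0. A small self-contained check (random logits, made-up values):

    import torch

    z = torch.randn(5)                     # raw logits
    y = (z > 0).float()                    # labels consistent with the logits

    wrong = torch.sigmoid(torch.round(z))  # sigmoid of integers: 0.269..., 0.5, 0.731...
    right = torch.round(torch.sigmoid(z))  # hard 0/1 predictions

    print((wrong == y).sum().item())       # 0 matches: never exactly 0.0 or 1.0
    print((right == y).sum().item())       # 5 matches: predictions recover the labels

With the order fixed to `torch.round(torch.sigmoid(z))`, as in the loop above, the accuracy tracking behaves the same with or without AMP.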