ValueError: Attempting to unscale FP16 gradients

I am training a BERT classifier with mixed precision (torch.cuda.amp.autocast plus GradScaler), and scaler.step(optimizer) raises the error shown in the trace at the bottom. Can you help me figure out whether there is an issue with my implementation specifically?

Model:

class BertClassifier(nn.Module):
    """
    Class defining the classifier model with a BERT encoder and a single fully connected classifier layer.
    """
    def __init__(self, dropout=0.5, num_labels=24):
        super(BertClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, num_labels)
        self.relu = nn.ReLU()
        self.best_score = 0

    def forward(self, input_id, mask):
        _, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
        output = self.relu(self.linear(self.dropout(pooled_output)))

        return output

Helper objects:

device = torch.device("cuda" if use_cuda else "cpu")
criterion = nn.CrossEntropyLoss().cuda() if use_cuda else nn.CrossEntropyLoss()
# Set eps to 1e-04 so the optimizer's epsilon term does not underflow in float16
optimizer = Adam(model.parameters(), lr=learning_rate, eps=1e-04)
# Gradient scaler for mixed-precision (float16/float32) training
scaler = torch.cuda.amp.GradScaler()
# Use scheduler to reduce learning rate gradually
scheduler = ReduceLROnPlateau(optimizer, factor=0.5, patience=5)
if use_cuda:
    # Use float16 to reduce GPU memory load
    model = model.cuda().to(dtype=torch.float16)
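
As a sanity check (this is not in my original script, just something I ran while debugging), printing the parameter dtypes after this setup shows every parameter as torch.float16, which I assumed was fine since GradScaler handles the scaling:

for name, param in model.named_parameters():
    # Every parameter reports torch.float16 after the .to(dtype=torch.float16) cast above
    print(name, param.dtype)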

Training steps:

def forward_pass(auxiliaries, inputs, label):
    device, criterion, optimizer, scaler, model, _ = auxiliaries
    label = label.to(device)
    mask = inputs['attention_mask'].to(device)
    input_id = inputs['input_ids'].squeeze(1).to(device)
    with torch.cuda.amp.autocast():
        output = model(input_id, mask)
        loss = criterion(output, label)
    return loss

def backward_pass(auxiliaries, batch_loss):
    _, _, optimizer, scaler, model, _ = auxiliaries

    scaler.scale(batch_loss).backward()
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad()

def train_loop(auxiliaries, train_dataloader):
    for train_input, train_label in tqdm(train_dataloader):
        batch_loss = forward_pass(auxiliaries, train_input, train_label)
        backward_pass(auxiliaries, batch_loss)
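
For comparison, this is the recipe I understood from the torch.cuda.amp examples (a minimal sketch using the same imports and helper objects as above, but with the model left in its default float32); I can't tell whether my explicit .to(dtype=torch.float16) cast is compatible with it:

model = BertClassifier().cuda()                     # parameters stay in float32
optimizer = Adam(model.parameters(), lr=learning_rate, eps=1e-04)
scaler = torch.cuda.amp.GradScaler()

for train_input, train_label in train_dataloader:
    optimizer.zero_grad()
    label = train_label.to(device)
    mask = train_input['attention_mask'].to(device)
    input_id = train_input['input_ids'].squeeze(1).to(device)
    with torch.cuda.amp.autocast():                 # forward pass and loss in mixed precision
        output = model(input_id, mask)
        loss = criterion(output, label)
    scaler.scale(loss).backward()                   # gradients stay float32, only scaled
    scaler.step(optimizer)                          # unscales gradients, then calls optimizer.step()
    scaler.update()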

Error trace:

Traceback (most recent call last):
  File "/bert_extraction/bert_extraction_main.py", line 27, in <module>
    ranked_train = train(model, df_train, df_val, ENV, label_converter)
  File "/bert_extraction/train_test.py", line 221, in train
    train_results = train_loop(auxiliaries, train_dataloader)
  File "/bert_extraction/train_test.py", line 140, in train_loop
    backward_pass(auxiliaries, batch_loss)
  File "/bert_extraction/train_test.py", line 119, in backward_pass
    scaler.step(optimizer)
  File "/anaconda/lib/python3.9/site-packages/torch/cuda/amp/grad_scaler.py", line 334, in step
    self.unscale_(optimizer)
  File "/anaconda/lib/python3.9/site-packages/torch/cuda/amp/grad_scaler.py", line 279, in unscale_
    optimizer_state["found_inf_per_device"] = self._unscale_grads_(optimizer, inv_scale, found_inf, False)
  File "/anaconda/lib/python3.9/site-packages/torch/cuda/amp/grad_scaler.py", line 207, in _unscale_grads_
    raise ValueError("Attempting to unscale FP16 gradients.")
ValueError: Attempting to unscale FP16 gradients.