Calculating accuracy for a Transformer model

I am having problems calculating the training accuracy of my model. I extracted the logits and applied softmax to them before calculating the accuracy. The accuracy increases every epoch, but the values are 1182.91, 2409.33, 3661.98, 4927.61, and 6197.20 over 5 epochs, which clearly isn't a valid accuracy. The function for measuring the accuracy is as follows:

import numpy as np

def flat_accuracy(preds, labels):
    # Pick the highest-scoring class for each example in the batch.
    pred_flat = np.argmax(preds, axis=1).flatten()
    labels_flat = labels.flatten()
    # Fraction of correct predictions in this batch, a value between 0 and 1.
    return np.sum(pred_flat == labels_flat) / len(labels_flat)
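
As a sanity check I ran the function on a toy batch (made-up numbers, not my real data), and it returns a sensible fraction per batch:

preds = np.array([[0.7, 0.2, 0.1],
                  [0.1, 0.8, 0.1],
                  [0.3, 0.3, 0.4],
                  [0.6, 0.3, 0.1]])   # 4 examples, 3 classes
labels = np.array([0, 1, 2, 1])       # the last example is misclassified

print(flat_accuracy(preds, labels))   # 0.75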

And the training epoch code is as follows:

import time
from collections import defaultdict

import numpy as np
import torch
import torch.nn.functional as F

loss_values = []
history = defaultdict(list)
train_acc = 0
# For each epoch...
for epoch_i in range(0, epochs):
    print("")
    print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))
    print('Training...')

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Reset the total loss for this epoch.
    total_loss = 0

    model.train()

    #correct_prediction= 0
    for step, batch in enumerate(train_dataloader):

        # Progress update every 50 batches.
        if step % 50 == 0 and step != 0:
            # Calculate elapsed time in minutes.
            elapsed = format_time(time.time() - t0)
            print('  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed))

        # `batch` contains three pytorch tensors: [0] input ids, [1] attention masks, [2] labels.
        b_input_ids = batch[0].to(device)
        b_input_mask = batch[1].to(device)
        b_labels = batch[2].to(device)

        # Clear any previously calculated gradients.
        # (source: https://stackoverflow.com/questions/48001598/why-do-we-need-to-call-zero-grad-in-pytorch)
        model.zero_grad()        

        # Forward pass on this training batch. Because labels are supplied,
        # the model returns the loss along with the logits.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs.loss
        #logits = outputs[1]
        #_, preds = torch.max(outputs, dim=1)
        # Accumulate the training loss over all of the batches 
        total_loss += loss.item()
        #correct_predictions += torch.sum(preds == b_labels)
        loss.backward()
        # Clip the gradient norm to 1.0 to prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()

        # Update the learning rate.
        scheduler.step()

        # Convert logits to probabilities and move them to the CPU as numpy arrays.
        logits = outputs.logits
        logits = F.softmax(logits, dim=-1)
        logits = logits.detach().cpu().numpy()
        
        labels = b_labels.to('cpu').numpy()
        tmp_train_accuracy = flat_accuracy(logits, labels)
        train_acc += tmp_train_accuracy
    # Calculate the average loss over the training data.
    avg_train_loss = total_loss / len(train_dataloader)
    # train_acc= correct_predictions.double() / len(train_dataloader)

    # Store the epoch metrics for plotting the learning curve.
    history['train_loss'].append(avg_train_loss)
    history['train_acc'].append(train_acc)
    
    print("")
    print("Training accuracy: {}".format(train_acc))
    print("  Average training loss:{0:.2f}".format(avg_train_loss))
    print("  Training epoch took: {:}".format(format_time(time.time() - t0)))

   
#    plt.ylim([0, 1]);
      
print("")
print("Training complete!")

Are there any mistakes in my logit calculation or in the accuracy function? Any thoughts?

Hi @the_coder
It looks like the flat_accuracy function returns an accuracy scaled into the 0-1 range for each batch, but train_acc is accumulated with train_acc += tmp_train_accuracy and is never rescaled afterwards (or reset between epochs), so it just keeps growing. Dividing by len(train_dataloader) at the end of each epoch, and resetting train_acc to 0 at the start of each epoch, should give you a proper 0-1 accuracy.
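
Something like this per-epoch bookkeeping should do it. This is a minimal sketch reusing the names from your own loop (model, train_dataloader, outputs, loss, b_labels, history), with the forward/backward pass elided:

for epoch_i in range(epochs):
    total_loss = 0.0
    train_acc = 0.0   # reset the running accuracy at the start of each epoch

    model.train()
    for step, batch in enumerate(train_dataloader):
        # ... forward/backward pass and optimizer step exactly as in your loop ...

        logits = outputs.logits.detach().cpu().numpy()
        labels = b_labels.to('cpu').numpy()
        train_acc += flat_accuracy(logits, labels)   # each term is in [0, 1]
        total_loss += loss.item()

    # Rescale: average the per-batch accuracies over the number of batches.
    avg_train_acc = train_acc / len(train_dataloader)
    avg_train_loss = total_loss / len(train_dataloader)
    history['train_acc'].append(avg_train_acc)
    history['train_loss'].append(avg_train_loss)
    print("Training accuracy: {:.4f}".format(avg_train_acc))

Side note: the softmax isn't needed for the accuracy itself. It is a monotonic transform, so np.argmax picks the same class on the raw logits as on the probabilities.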

Thanks for your input @yoshitomo-matsubara, you are right, this was causing the problem.