Why is the F1 score 0 for every epoch while fine-tuning DistilBERT with PyTorch?

Hi,

I am trying to fine-tune DistilBERT with PyTorch and am wondering why the F1 score is 0 for every epoch. The goal is to predict the category of our call log messages. I'd appreciate any help...

A sample of my input dataset (multi-class labels) looks like this:
  
|     text       |   target  | 
| -------------- | --------- |
| call log msg0  |      0    |
| call log msg1  |      1    |
| call log msg2  |      2    |
| call log msg3  |      3    |
| call log msg4  |      4    |

...and so on.
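
The DataLoaders are built from a PyTorch Dataset that tokenizes each message with the DistilBERT tokenizer. A simplified sketch of it is below (the `ids` / `mask` / `targets` keys are what the training loop reads; the class name, max length and other details are placeholders rather than my exact code):

    import torch
    from torch.utils.data import Dataset
    from transformers import DistilBertTokenizer

    class CallLogDataset(Dataset):
        """Simplified sketch of the dataset behind train_dl / valid_dl."""

        def __init__(self, texts, labels, max_len=128):
            self.texts = texts
            self.labels = labels
            self.max_len = max_len
            self.tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")

        def __len__(self):
            return len(self.texts)

        def __getitem__(self, idx):
            enc = self.tokenizer(
                self.texts[idx],
                padding="max_length",
                truncation=True,
                max_length=self.max_len,
            )
            return {
                "ids": torch.tensor(enc["input_ids"], dtype=torch.long),
                "mask": torch.tensor(enc["attention_mask"], dtype=torch.long),
                "targets": torch.tensor(self.labels[idx]),
            }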
The training loss and validation F1 per epoch:

    Epoch [1/7], Train Loss: -1.7155 and Validation f1 0.0000
    Epoch [2/7], Train Loss: -7.2223 and Validation f1 0.0000
    Epoch [3/7], Train Loss: -10.0720 and Validation f1 0.0000
    Epoch [4/7], Train Loss: -20.7667 and Validation f1 0.0000
    Epoch [5/7], Train Loss: -30.4122 and Validation f1 0.0000
    Epoch [6/7], Train Loss: -28.9554 and Validation f1 0.0000

My training and evaluation code is below:

    import torch
    from sklearn import metrics
    from tqdm import tqdm

    fit(7, model, loss_fn, optimizer, train_dl, valid_dl)

    def fit(num_epochs, model, loss_fn, opt, train_dl, valid_dl):
        for epoch in range(num_epochs):
            model.train()
            for _, data in enumerate(train_dl, 0):
                ids = data['ids'].to(device, dtype=torch.long)        # token ids of the text
                mask = data['mask'].to(device, dtype=torch.long)      # attention mask
                targets = data['targets'].to(device, dtype=torch.float)
                outputs = model(ids, mask).squeeze()
                loss = loss_fn(outputs, targets)
                loss.backward()
                opt.step()
                opt.zero_grad()
            valid_f1 = eval_fn(valid_dl, model)
            print('Epoch [{}/{}], Train Loss: {:.4f} and Validation f1 {:.4f}'.format(epoch+1, num_epochs, loss.item(), valid_f1))
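
    # For context: the model is DistilBERT with a small linear head on top.
    # This is only a simplified sketch, not my exact class; the single output
    # unit matches the (batch, 1) outputs printed in eval_fn below, and the
    # Adam optimizer here is just a placeholder. loss_fn is defined elsewhere
    # and not shown in this paste.
    from transformers import DistilBertModel

    class DistilBERTClassifier(torch.nn.Module):
        def __init__(self):
            super().__init__()
            self.bert = DistilBertModel.from_pretrained("distilbert-base-uncased")
            self.out = torch.nn.Linear(768, 1)  # one output unit

        def forward(self, ids, mask):
            hidden = self.bert(input_ids=ids, attention_mask=mask)[0]  # (batch, seq_len, 768)
            return self.out(hidden[:, 0])  # take the [CLS] position -> (batch, 1)

    model = DistilBERTClassifier().to(device)
    optimizer = torch.optim.Adam(model.parameters(), lr=2e-5)  # assumed placeholder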
    
    def eval_fn(data_loader, model):
        model.eval()
        fin_targets = []  # "fin" presumably short for final, i.e. accumulated over all validation batches
        fin_outputs = []
        with torch.no_grad():
            for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
                ids = d["ids"]
                mask = d["mask"]
                targets = d["targets"]
                ids = ids.to(device, dtype=torch.long)
                mask = mask.to(device, dtype=torch.long)
                targets = targets.to(device, dtype=torch.float)
                outputs = model(ids=ids, mask=mask)
                # print("outputs", outputs) gives, e.g.:
                # tensor([[0.4337],[0.3716],[0.3827],[0.3683],[0.3661],[0.3927],[0.3947],[0.3413],[0.3994],[0.3700],[0.3617]])
                fin_targets.extend(targets.cpu().detach().numpy().tolist())
                #fin_outputs.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())
                fin_outputs.extend(torch.softmax(outputs, dim=1).tolist())
                # fin_outputs before tolist() is applied:
                # [tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.]), tensor([1.])]
                print('fin_outputs', fin_outputs)
                # which prints: [[1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0], [1.0]]
        #fin_outputs = np.array(fin_outputs) >= 0.5
        f1 = metrics.f1_score(fin_targets, fin_outputs, average="micro")
        # parameters for metrics.f1_score:
        #   y_true: 1d array-like, ground truth (correct) target values
        #   y_pred: 1d array-like, estimated targets as returned by a classifier
        #   average="micro": calculate metrics globally by counting the total true positives, false negatives and false positives
        return f1
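
For reference, this is my understanding of the input format `metrics.f1_score` expects, checked on a tiny made-up example (the labels below are invented, not from my data):

    from sklearn import metrics

    # both arguments are 1-D sequences of class labels, not probabilities
    y_true = [0, 1, 2, 2, 4]
    y_pred = [0, 1, 2, 3, 4]
    print(metrics.f1_score(y_true, y_pred, average="micro"))  # 0.8 (4 of 5 correct)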