BertForMaskedLM's loss and scores: how is the loss computed?

I have a simple MaskedLM setup with one masked token at position 7. The model returns 20.2516 as the loss and 18.0698 as the score of the predicted token, but I'm not sure how the loss is derived from the scores. I assumed the loss would be
loss = -log(softmax(scores[0][7])[prediction])
but computing this gives 0.0002, which doesn't match. I'm confused about how the loss is computed inside the model.
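
If it helps, my (possibly mistaken) reading of the Hugging Face source is that BertForMaskedLM computes the loss with torch.nn.CrossEntropyLoss over the raw logits and the labels, ignoring every position labelled -100. A minimal sketch of that computation (the tensors here are made-up stand-ins, just for illustration):

import torch
import torch.nn.functional as F

# toy stand-ins for the model outputs; shapes match bert-base-uncased
vocab_size, seq_len = 30522, 64
logits = torch.randn(1, seq_len, vocab_size)   # the "scores" the model returns
labels = torch.full((1, seq_len), -100)        # -100 everywhere ...
labels[0, 7] = 103                             # ... except the masked position (103 = [MASK], as in my code below)
# cross_entropy ignores index -100 by default, so only position 7 contributes
loss = F.cross_entropy(logits.view(-1, vocab_size), labels.view(-1))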

import torch
from transformers import BertForMaskedLM, BertTokenizerFast
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
tokenizer = BertTokenizerFast.from_pretrained('bert-base-uncased')


text = "Who was Jim Paterson ? Jim Paterson is a doctor".lower()
inputs = tokenizer.encode_plus(text, return_tensors="pt", add_special_tokens=True, truncation=True,
                               padding='max_length', return_attention_mask=True, max_length=64)
input_ids = inputs['input_ids']
masked = inputs['input_ids'].clone()
masked[0][7] = 103  # 103 = [MASK] token id
for t in range(len(masked[0])):
  if masked[0][t] != 103:
    masked[0][t] = -100  # positions labelled -100 are ignored by the loss
outputs = model(input_ids=input_ids, attention_mask=inputs['attention_mask'],
                token_type_ids=inputs['token_type_ids'], labels=masked)
loss, scores = outputs.loss, outputs.logits
print('loss', loss)
print(scores.shape)
pred = torch.argmax(scores[0][7]).item()
print("predicted token:", pred, tokenizer.convert_ids_to_tokens([pred]))
print("score:", scores[0][7][pred])


# my attempt to reproduce the loss: -log softmax of the score at the predicted token
log_softmax = torch.nn.LogSoftmax(dim=1)
nll_loss = torch.nn.NLLLoss()
output = nll_loss(log_softmax(torch.unsqueeze(scores[0][7], 0)), torch.tensor([pred]))
print(output)
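
For comparison, I would expect the reported loss to be reproducible directly from the returned tensors, assuming (again, my reading of the source, so possibly wrong) that the model uses CrossEntropyLoss with its default ignore_index=-100:

import torch.nn.functional as F

# flatten scores to (seq_len, vocab_size) and labels to (seq_len,);
# entries labelled -100 are ignored, so only position 7 contributes
manual_loss = F.cross_entropy(scores.view(-1, scores.size(-1)), masked.view(-1))
print('manual loss:', manual_loss)  # should equal the loss returned by the model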