Hello everyone, I was working on a customer NER problem by fine-tuning base BERT, and for the 4th time the model isn't learning anything: the loss values look arbitrary but stay essentially constant over the epochs on every try, e.g.:
Epoch 0: Train Loss = 0.2952, Val Loss = 0.2944
Epoch 10: Train Loss = 0.2950, Val Loss = 0.2944
Here is the model class:
class NERBertModel(nn.Module):
    def __init__(self, num_tag):
        super(NERBertModel, self).__init__()
        self.num_tag = num_tag
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.bert_drop = nn.Dropout(0.3)
        self.out_tag = nn.Linear(768, self.num_tag)

    def forward(self, ids, mask, token_type_ids, target_tags):
        output, _ = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        bert_out = self.bert_drop(output)
        tag = self.out_tag(bert_out)

        # Calculate the loss
        Critirion_Loss = nn.CrossEntropyLoss(ignore_index=-100)
        active_loss = mask.view(-1) == 1
        active_logits = tag.view(-1, self.num_tag)
        active_labels = torch.where(active_loss, target_tags.view(-1), torch.tensor(Critirion_Loss.ignore_index).type_as(target_tags))
        loss = Critirion_Loss(active_logits, active_labels)
        return tag, loss
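For reference, this is roughly how the forward pass gets called; the num_tag value and the tensor shapes below are just illustrative placeholders, not my actual config:

# Minimal sketch of running one dummy batch through the model (illustrative shapes only)
import torch

num_tag = 5                                                 # placeholder number of NER tags
model = NERBertModel(num_tag=num_tag)

batch_size, seq_len = 2, 16
ids = torch.randint(0, 30522, (batch_size, seq_len))        # random ids from the bert-base-uncased vocab
mask = torch.ones(batch_size, seq_len, dtype=torch.long)    # attention mask: treat every position as real
token_type_ids = torch.zeros(batch_size, seq_len, dtype=torch.long)
target_tags = torch.randint(0, num_tag, (batch_size, seq_len))

logits, loss = model(ids=ids, mask=mask, token_type_ids=token_type_ids, target_tags=target_tags)
print(logits.shape, loss.item())                            # (2, 16, num_tag) and a scalar loss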
Hyperparameters:
# Function for getting the optimizer parameter groups
def get_hyperparameters(model, ff):
    # ff: full_finetuning
    if ff:
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "gamma", "beta"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                "weight_decay_rate": 0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay_rate": 0.0,
            },
        ]
    else:
        param_optimizer = list(model.out_tag.named_parameters())
        optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]
    return optimizer_grouped_parameters
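The grouped parameters then feed into the optimizer and scheduler along these lines (a sketch assuming the model and train_data_loader from above; the learning rate, warmup and epoch count here are placeholders, not my exact settings):

# Sketch of wiring the parameter groups into the optimizer and scheduler (placeholder values)
from torch.optim import AdamW
from transformers import get_linear_schedule_with_warmup

FULL_FINETUNING = True
optimizer_grouped_parameters = get_hyperparameters(model, FULL_FINETUNING)
optimizer = AdamW(optimizer_grouped_parameters, lr=3e-5)

num_epochs = 10
num_training_steps = len(train_data_loader) * num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)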
Train/val functions:
def train_fn(train_data_loader, model, optimizer, device, scheduler):
    # Train the model
    model.train()
    loss_ = 0
    for data in tqdm(train_data_loader, total=len(train_data_loader)):
        for i, j in data.items():
            data[i] = j.to(device)
        # Backward propagation
        optimizer.zero_grad()
        _, loss = model(**data)
        loss.backward()
        optimizer.step()
        scheduler.step()
        loss_ += loss.item()
    return model, loss_ / len(train_data_loader)
def val_fn(val_data_loader, model, optimizer, device, scheduler):
    model.eval()
    loss_ = 0
    with torch.no_grad():  # no gradients needed for validation
        for data in tqdm(val_data_loader, total=len(val_data_loader)):
            for i, j in data.items():
                data[i] = j.to(device)
            _, loss = model(**data)
            loss_ += loss.item()
    return loss_ / len(val_data_loader)
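For completeness, these two functions are driven by a standard epoch loop along these lines (a sketch, not my exact loop; num_epochs is illustrative):

# Sketch of the outer epoch loop driving train_fn / val_fn
for epoch in range(num_epochs):
    model, train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
    val_loss = val_fn(val_data_loader, model, optimizer, device, scheduler)
    print(f"Epoch {epoch}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")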
PS: the dataset is quite imbalanced, with the target classes making up barely 10% of all the labels.
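For reference, a class-weighted loss to counter that imbalance would look something like this (just a sketch; tag_counts below is a hypothetical per-tag frequency array, not computed from my data):

# Sketch: class-weighted CrossEntropyLoss to counter the label imbalance
# (tag_counts is a hypothetical per-tag frequency array from the training labels)
import torch
import torch.nn as nn

tag_counts = torch.tensor([9000.0, 400.0, 300.0, 200.0, 100.0])    # e.g. "O" dominates
class_weights = tag_counts.sum() / (len(tag_counts) * tag_counts)  # inverse-frequency weights

criterion = nn.CrossEntropyLoss(weight=class_weights, ignore_index=-100)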