BERT NER model not learning: constant loss over epochs

Hello everyone, I am working on a custom NER problem by fine-tuning base BERT, and for the 4th time the model isn't learning anything: the loss starts at a seemingly arbitrary value and then stays essentially constant over the epochs on every run, e.g.:
Epoch 0: Train Loss = 0.2952, Val Loss = 0.2944
Epoch 10: Train Loss = 0.2950, Val Loss = 0.2944

Here is the model class:
import torch
import torch.nn as nn
from transformers import BertModel

class NERBertModel(nn.Module):

    def __init__(self, num_tag):
        super(NERBertModel, self).__init__()
        self.num_tag = num_tag
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.bert_drop = nn.Dropout(0.3)
        self.out_tag = nn.Linear(768, self.num_tag)

    def forward(self, ids, mask, token_type_ids, target_tags):
        # Last hidden states, shape (batch, seq_len, 768)
        output, _ = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        bert_out = self.bert_drop(output)
        tag = self.out_tag(bert_out)

        # Calculate the loss, masking out padded positions via ignore_index
        criterion = nn.CrossEntropyLoss(ignore_index=-100)
        active_loss = mask.view(-1) == 1
        active_logits = tag.view(-1, self.num_tag)
        active_labels = torch.where(active_loss, target_tags.view(-1),
                                    torch.tensor(criterion.ignore_index).type_as(target_tags))
        loss = criterion(active_logits, active_labels)
        return tag, loss
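As a quick sanity check of the class above, a forward pass with dummy tensors (all sizes below are made-up) should give logits of shape (batch, seq_len, num_tag) and, before any training, a loss near ln(num_tag); the near-constant 0.29 I see is well below that, which makes me suspect the model is collapsing onto the majority tag:

# Minimal sanity-check sketch with hypothetical sizes: batch=2, seq_len=16, 5 tags
import torch

model = NERBertModel(num_tag=5)
ids = torch.randint(0, 30522, (2, 16))          # random token ids from BERT's vocab
mask = torch.ones(2, 16, dtype=torch.long)      # no padding in this toy batch
token_type_ids = torch.zeros(2, 16, dtype=torch.long)
target_tags = torch.randint(0, 5, (2, 16))      # random labels, just for the shape test

tag, loss = model(ids, mask, token_type_ids, target_tags)
print(tag.shape)    # expected: torch.Size([2, 16, 5])
print(loss.item())  # roughly ln(5) ≈ 1.61 before any training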

Hyperparameters (optimizer parameter groups):
# Function to build the optimizer parameter groups
def get_hyperparameters(model, ff):
    # ff: full_finetuning
    if ff:
        param_optimizer = list(model.named_parameters())
        # Note: in the PyTorch BERT implementation the LayerNorm parameters are
        # named "LayerNorm.weight"/"LayerNorm.bias", not "gamma"/"beta", so the
        # original list never matched anything
        no_decay = ["bias", "LayerNorm.weight"]
        optimizer_grouped_parameters = [
            {
                "params": [
                    p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
                ],
                # torch.optim.AdamW reads the key "weight_decay"; an extra key
                # like "weight_decay_rate" is silently ignored
                "weight_decay": 0.01,
            },
            {
                "params": [
                    p for n, p in param_optimizer if any(nd in n for nd in no_decay)
                ],
                "weight_decay": 0.0,
            },
        ]
    else:
        # Only the classification head's parameters are passed to the optimizer
        param_optimizer = list(model.out_tag.named_parameters())
        optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

    return optimizer_grouped_parameters
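For context, this is roughly how I pass these parameter groups on; a sketch assuming torch.optim.AdamW and the transformers linear warmup scheduler, where the learning rate and step counts are placeholders rather than my exact values:

import torch
from transformers import get_linear_schedule_with_warmup

optimizer_grouped_parameters = get_hyperparameters(model, ff=True)
# 3e-5 is a placeholder; typical BERT fine-tuning LRs are in the 1e-5 to 5e-5 range
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=3e-5)

num_training_steps = len(train_data_loader) * num_epochs  # assumes these are defined
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)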

Train/validation functions:

from tqdm import tqdm

def train_fn(train_data_loader, model, optimizer, device, scheduler):
    # Train the model for one epoch
    model.train()
    loss_ = 0
    for data in tqdm(train_data_loader, total=len(train_data_loader)):
        # Move every tensor in the batch to the target device
        for i, j in data.items():
            data[i] = j.to(device)

        # Backward propagation
        optimizer.zero_grad()
        _, loss = model(**data)
        loss.backward()
        optimizer.step()
        scheduler.step()
        loss_ += loss.item()
    return model, loss_ / len(train_data_loader)

def val_fn(val_data_loader, model, optimizer, device, scheduler):
    # Evaluate the model; optimizer and scheduler are unused here but kept
    # so both functions share the same signature
    model.eval()
    loss_ = 0
    with torch.no_grad():  # no gradients needed for validation
        for data in tqdm(val_data_loader, total=len(val_data_loader)):
            for i, j in data.items():
                data[i] = j.to(device)
            _, loss = model(**data)
            loss_ += loss.item()
    return loss_ / len(val_data_loader)
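The outer loop that produced the log above looks roughly like this (a sketch; num_epochs and the data loaders are assumed to be defined elsewhere):

for epoch in range(num_epochs):
    model, train_loss = train_fn(train_data_loader, model, optimizer, device, scheduler)
    val_loss = val_fn(val_data_loader, model, optimizer, device, scheduler)
    print(f"Epoch {epoch}: Train Loss = {train_loss:.4f}, Val Loss = {val_loss:.4f}")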

PS: the dataset is heavily imbalanced; the entity classes make up barely 10% of all labels.
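If the imbalance is the culprit, one common mitigation would be to give CrossEntropyLoss per-class weights so the rare entity tags aren't drowned out by "O"; a minimal sketch with illustrative numbers, not my real label counts:

# Hypothetical example: num_tag = 3 with tag 0 = "O" covering ~90% of tokens.
# Inverse-frequency style weights down-weight "O" and up-weight the rare tags.
import torch
import torch.nn as nn

class_weights = torch.tensor([0.1, 1.0, 1.0])  # illustrative values, tune from your label counts
criterion = nn.CrossEntropyLoss(weight=class_weights, ignore_index=-100)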