Why is my BERT model's loss not decreasing?

I am trying to solve an NLP problem where, given two sentences (a premise and a hypothesis), we have to decide whether they are unrelated, related, or contradictory.
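
For illustration, a training row looks something like the following (hypothetical example: the sentences are made up and the 0/1/2 integer label mapping is an assumption, not something fixed by the problem statement):

# hypothetical row; assumes integer labels 0, 1, 2 for the three classes
example = {
    'premise': 'A man is playing a guitar on stage.',
    'hypothesis': 'Someone is performing music.',
    'label': 0,  # whichever integer the dataset assigns to "related"
}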

I am using a pretrained BERT model from Hugging Face as the encoder.
Here is my model:

class BertModel(nn.Module):

    def __init__(self, model):
        super(BertModel, self).__init__()
        self.bert_model = model
        self.dropout = nn.Dropout(0.3)
        # mean-pooled and max-pooled hidden states get concatenated, hence 768*2 features in, 3 classes out
        self.linear = nn.Linear(768 * 2, 3)

    def forward(self, xb):
        # last hidden states from BERT: (batch, seq_len, 768)
        o1, _ = self.bert_model(xb)
        # mean- and max-pool over the sequence dimension, then concatenate
        apool = torch.mean(o1, 1)
        mpool, _ = torch.max(o1, 1)
        cat = torch.cat((apool, mpool), 1)
        x = self.dropout(cat)
        return self.linear(x)
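
For completeness, the imports are roughly the following (AdamW and the scheduler come from the transformers library here, though AdamW could equally be torch.optim.AdamW):

import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset
from transformers import AutoTokenizer, AdamW, get_linear_schedule_with_warmup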

This is my dataset class:

class DataTokenizer(Dataset):

    def __init__(self, data, model_path, max_len, text_transform=None, one_hot=False):

        self.max_len = max_len
        # each item is a [premise, hypothesis] pair
        self.text = data[['premise', 'hypothesis']].values.tolist()
        self.text_transform = text_transform

        if one_hot:
            labels = pd.get_dummies(data.label)
        else:
            labels = data.label

        self.labels = labels.values.tolist()

        self.tokenizer = AutoTokenizer.from_pretrained(model_path)

    def get_tokens(self, text):
        # tokenize and pad to max_len; only the input_ids are returned
        encode = self.tokenizer.encode_plus(
            text,
            max_length=self.max_len,
            pad_to_max_length=True
        )
        return encode['input_ids']

    def __len__(self):
        return len(self.text)

    def __getitem__(self, idx):
        text = self.text[idx]

        tokens = self.get_tokens(text)
        tokens = torch.tensor(tokens)

        label = self.labels[idx]
        label = torch.tensor(label, dtype=torch.float)

        return tokens, label
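
As a quick sanity check of the dataset (hypothetical snippet; train_data and MODEL_PATH are assumed to be the same objects used in the training code below), a single item should come back as a max_len-sized tensor of token ids plus a scalar label:

ds = DataTokenizer(train_data, MODEL_PATH, max_len=100)
tokens, label = ds[0]
print(tokens.shape, tokens.dtype)  # expected to be roughly torch.Size([100]) and int64
print(label)                       # a single float label (cast to long in the training loop)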

And here is my training loop:

def run():
    
    losses = list()
    loss_fn = nn.CrossEntropyLoss()
    
    def train_loop(train_loader, model, optimizer, loss_fn, device, scheduler=None):

        model.train()
        for i, batch in enumerate(train_loader):
            # get the batch from the data loader
            input_tokens, labels = batch

            # move the tensors to the device
            input_tokens = input_tokens.to(device, dtype=torch.long)
            labels = labels.to(device, dtype=torch.long)

            optimizer.zero_grad()
            output = model(input_tokens)

            loss = loss_fn(output, labels)
            losses.append(loss)

            if i % 10 == 0:
                print(f'i={i}, loss={loss}')
#                 print(output, labels)

            loss.backward()
            optimizer.step()

            if scheduler is not None:
                scheduler.step()

                
    lr = 1e-5
    MAX_LEN = 100
    EPOCHS = 10
    batch_size = 32

    train_ds = DataTokenizer(train_data, MODEL_PATH, MAX_LEN)

    train_dl = torch.utils.data.DataLoader(
        train_ds,
        batch_size=batch_size,
        shuffle=True,
        num_workers=4,
        drop_last=True,
    )

    device = torch.device('cuda')
    model = mx.to(device)

    num_train_steps = int(len(train_ds) / batch_size / EPOCHS)

    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    print('train_start')
    for i in range(EPOCHS):
        train_loop(train_dl, model, optimizer, loss_fn, device, scheduler=None)
    torch.save(model.state_dict(), 'model.bin')

    return losses
    

I ran this model for 10 epochs with learning rates of 1e-3, 1e-4, and 1e-5.

But the loss stays around 1.05 for all 10 epochs, and the model's output is always around [0.33, 0.33, 0.33], like random guessing.
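
For reference, a loss stuck around 1.05-1.10 is essentially the cross-entropy of a uniform guess over three classes, since -ln(1/3) ≈ 1.0986:

import math
# cross-entropy of a uniform 3-class prediction, independent of the true label
print(-math.log(1 / 3))  # ≈ 1.0986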

I want to know why my model is not training.