Why do I get worse accuracy when using BERT?

I’m using the following code:

**Import the Libraries:**

! pip install transformers

import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoModel, BertTokenizer, AdamW
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np

**Prepare the Data:**

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_set = pd.read_csv('/content/drive/My Drive/train_set.csv')
val_set = pd.read_csv('/content/drive/My Drive/val_set.csv')
X_train, y_train = train_set['text'], train_set['label']
X_val, y_val = val_set['text'], val_set['label']

# compute the class weights to counter label imbalance
class_wts = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)

# convert class weights to tensor
weights = torch.tensor(class_wts, dtype=torch.float)
weights = weights.to(device)
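
(For reference, 'balanced' weights are n_samples / (n_classes * count_per_class), so the rarer class gets the larger weight; a quick check with hypothetical labels, using the imports above:)

y = np.array([0, 0, 0, 1])  # hypothetical, imbalanced labels
print(compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y))
# -> approximately [0.667, 2.0], i.e. 4 / (2 * 3) and 4 / (2 * 1)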

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

X_train = tokenizer.batch_encode_plus(
    X_train.tolist(),
    max_length=30, padding='max_length', truncation=True,  # pad_to_max_length is a boolean flag; the length belongs in max_length
    return_token_type_ids=False, return_length=True, return_attention_mask=True)


train_label = torch.tensor(train_set['label'].tolist(), dtype=torch.long)

train_length = torch.tensor(X_train['length'])  # I will not use this in the training for now
train_input = torch.tensor(X_train['input_ids'])
train_attention_mask = torch.tensor(X_train['attention_mask'])
train_set = TensorDataset(train_input, train_length, train_attention_mask, train_label)


X_val = tokenizer.batch_encode_plus(
    X_val.tolist(),
    max_length=30, padding='max_length', truncation=True,
    return_token_type_ids=False, return_length=True, return_attention_mask=True)

val_label = torch.tensor(val_set['label'].tolist(), dtype=torch.long)
val_length = torch.tensor(X_val['length'])  # I will not use this in the training for now
val_input = torch.tensor(X_val['input_ids'])
val_attention_mask = torch.tensor(X_val['attention_mask'])
val_set = TensorDataset(val_input, val_length, val_attention_mask, val_label)

train_loader = DataLoader(train_set, batch_size=50, sampler=RandomSampler(train_set))
val_loader = DataLoader(val_set, batch_size=50, sampler=SequentialSampler(val_set))

**Training the model:**

class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.bert = AutoModel.from_pretrained('bert-base-uncased')
        for param in self.bert.parameters():
            param.requires_grad = False
        self.linear = nn.Linear(self.bert.config.hidden_size, 200)
        self.drop = nn.Dropout(.5)
        self.output = nn.Linear(200, 2)

    def forward(self, x, mask, lens):
        _, x = self.bert(x, attention_mask=mask)  # second tuple element is the pooled [CLS] output
        x = self.drop(x)
        x = nn.functional.relu(self.linear(x))
        x = self.output(x)
        return x

model = BERT()

model.to(device)
weights= torch.tensor(class_wts,dtype=torch.float)
weights = weights.to(device)

optimizer = AdamW(model.parameters(), lr=.01)
cross_entropy = nn.NLLLoss(weight=weights).to(device)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=.1, patience=2, verbose=True)


def big_training_loop(epochs, optim, clf, lossf, train_loader, val_loader, accuracy_score):
    # variables for early stopping
    n_epochs_stop = 10
    epochs_no_improve = 0
    min_val_loss = float('inf')

    # running epoch
    for epoch in range(1, epochs + 1):

        # variables for performance monitoring
        loss_train_list = []
        loss_val_list = []
        correct_train_list_fscore = []
        correct_train_list = []
        correct_val_list_fscore = []
        correct_val_list = []

        # training on batches (note: zip stops at the shorter of the two loaders)
        for (text, lens, mask, labels), (val_text, val_lens, val_mask, val_labels) in zip(train_loader, val_loader):
            # - training section - #
            clf.train()
            text, lens, mask, labels = text.to(device), lens, mask.to(device), labels.to(device)
            output = clf(text, mask, lens)
            # compute loss function
            loss = lossf(output, labels)
            # append loss output to loss_train_list for monitoring
            loss_train_list.append(loss.item())

            # convert tensors to numpy array
            labels = labels.cpu().detach().numpy()
            output = torch.argmax(output, dim=1).cpu().detach().numpy()
            # calculate accuracy and macro F1 (y_true first, then y_pred)
            correct_train_list_fscore.append(f1_score(labels, output, average='macro'))
            correct_train_list.append(accuracy_score(labels, output))
            # compute back propagation
            loss.backward()
            # update weights
            optim.zero_grad()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optim.step()
            # clean gradients to not accumulate

            # - evaluation section - #
            clf.eval()
            with torch.no_grad():
                # run the validation batch through the model
                val_text, val_lens, val_mask, val_labels = val_text.to(device), val_lens.to(device), val_mask.to(device), val_labels.to(device)
                val_output = clf(val_text, val_mask, val_lens).squeeze()

                # compute loss function
                val_loss = lossf(val_output, val_labels)
                # append loss output to loss_val_list for monitoring
                loss_val_list.append(val_loss.item())

                # convert tensors to numpy array
                val_output = torch.argmax(val_output, dim=1).cpu().detach().numpy()
                val_labels = val_labels.cpu().detach().numpy()

                # calculate accuracy and macro F1 (y_true first, then y_pred)
                correct_val_list.append(accuracy_score(val_labels, val_output))
                correct_val_list_fscore.append(f1_score(val_labels, val_output, average='macro'))
              
            # average the running metrics for monitoring
            loss = torch.mean(torch.FloatTensor(loss_train_list))
            val_loss = torch.mean(torch.FloatTensor(loss_val_list))
            acc = torch.mean(torch.FloatTensor(correct_train_list)) * 100
            fscore = torch.mean(torch.FloatTensor(correct_train_list_fscore)) * 100
            val_acc = torch.mean(torch.FloatTensor(correct_val_list)) * 100
            val_fscore = torch.mean(torch.FloatTensor(correct_val_list_fscore)) * 100

        # reduce the learning rate if the validation loss plateaued
        scheduler.step(val_loss)

        # save best model if the loss is the best
        if torch.mean(torch.FloatTensor(loss_val_list)) < min_val_loss:
          # variables for best performance
          best_epoch = epoch
          best_loss = loss
          best_val_loss = val_loss
          best_acc = acc
          best_fscore = fscore
          best_val_acc = val_acc
          best_val_fscore = val_fscore
          # reset the early-stopping counter and record the best loss
          epochs_no_improve = 0
          min_val_loss = best_val_loss
          # print the current epoch as the best epoch

          print(
              f'BEST EPOCH: Epoch({epoch}) -> Train: (Accuracy: {best_acc:.1f}, f-score: {best_fscore:.1f}, Loss: {best_loss:.4f}) | Val: (Accuracy: {best_val_acc:.1f}, f-score: {best_val_fscore:.1f}, Loss: {best_val_loss:.4f})')

        else:
            # print the current epoch as a normal epoch
            print(
                f'Epoch({epoch}) -> Train: (Accuracy: {acc:.1f}, f-score: {fscore:.1f}, Loss: {loss:.4f}) | Val: (Accuracy: {val_acc:.1f}, f-score: {val_fscore:.1f}, Loss: {val_loss:.4f})')
            # if epochs_no_improve reached n_epochs_stop the training will stop
            epochs_no_improve += 1

        # early stop the training
        if epoch > 5 and epochs_no_improve == n_epochs_stop:
            torch.save(clf, f'clf_val_loss_{best_val_loss:.4f}_f-score_{best_val_fscore:.1f}_val_acc_{best_val_acc:.1f}.pt')
            print('Early stopping!')
            print()
            print(
                f'BEST EPOCH: Epoch({best_epoch}) -> Train: (Accuracy: {best_acc:.1f}, f-score: {best_fscore:.1f}, Loss: {best_loss:.4f}) | Val: (Accuracy: {best_val_acc:.1f}, f-score: {best_val_fscore:.1f}, Loss: {best_val_loss:.4f})')
            break

print('Running...')
big_training_loop(555, optimizer, model, cross_entropy, train_loader, val_loader, accuracy_score=accuracy_score)
Running...
BEST EPOCH: Epoch(1) -> Train: (Accuracy: 28.4, f-score: 25.2, Loss: -0.0608) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(2) -> Train: (Accuracy: 28.0, f-score: 25.4, Loss: -0.0458) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(3) -> Train: (Accuracy: 32.5, f-score: 29.5, Loss: -0.0696) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(4) -> Train: (Accuracy: 36.0, f-score: 33.0, Loss: -0.0618) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(5) -> Train: (Accuracy: 31.1, f-score: 27.0, Loss: -0.0599) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(6) -> Train: (Accuracy: 35.1, f-score: 31.0, Loss: -0.0712) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(7) -> Train: (Accuracy: 34.0, f-score: 31.3, Loss: -0.0641) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(8) -> Train: (Accuracy: 34.7, f-score: 30.9, Loss: -0.0722) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(9) -> Train: (Accuracy: 31.8, f-score: 28.1, Loss: -0.0570) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(10) -> Train: (Accuracy: 35.3, f-score: 31.3, Loss: -0.0598) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(11) -> Train: (Accuracy: 38.0, f-score: 33.8, Loss: -0.0860) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Early stopping!

BEST EPOCH: Epoch(1) -> Train: (Accuracy: 28.4, f-score: 25.2, Loss: -0.0608) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)

I don’t know why the accuracy is so low; I’ve tried many loss functions.

The way BERT is implemented here is inspired by this tutorial.

Why would you freeze the model params? Yes, the model is pretrained, but it has to be fine-tuned on the downstream task; without that, it will most likely act as a random classifier. And for the loss function, use CrossEntropyLoss: NLLLoss expects log-probabilities, while your model outputs raw logits, which is why the reported loss goes negative. Btw, instead of defining your own BERT for sequence classification, you can use the built-in BertForSequenceClassification from HuggingFace Transformers.
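
For example, a minimal fine-tuning setup along those lines might look like this (a sketch assuming binary labels and the DataLoader defined above; lr=2e-5 is a common fine-tuning choice, not taken from this thread):

from transformers import BertForSequenceClassification, AdamW

# all parameters stay trainable, so BERT itself is fine-tuned on the task
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)  # small LR, typical for fine-tuning
loss_fn = nn.CrossEntropyLoss(weight=weights)   # expects raw logits, not log-probabilities

model.train()
for text, lens, mask, labels in train_loader:
    optimizer.zero_grad()  # clear old gradients before each batch
    logits = model(text.to(device), attention_mask=mask.to(device))[0]  # first tuple element = logits
    loss = loss_fn(logits, labels.to(device))
    loss.backward()
    optimizer.step()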


I changed the classifier to:

self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased')
and for the forward function:
x = self.bert(x, attention_mask=mask)

but I ran into another problem: the output of self.bert is a tuple containing a tensor, and I want to extract the tensor from that tuple:

(tensor([[0.6515, 0.6653],
        [0.2553, 0.1504],
        [0.4994, 0.4797],
        [0.6337, 0.2966],
        [0.2209, 0.5274],
        [0.5424, 0.3485],
        [0.3630, 0.1887],
        [0.2545, 0.7262],
        [0.2021, 0.4187],
        [0.5003, 0.1696],
        [0.4143, 0.5789],
        [0.4960, 0.4392],
        [0.4121, 0.5264],
        [0.5776, 0.4939],
        [0.3785, 0.4943],
        [0.4546, 0.4430],
        [0.6860, 0.3314],
        [0.4527, 0.3885],
        [0.2221, 0.6505],
        [0.3507, 0.2968],
        [0.3292, 0.6263],
        [0.3475, 0.3593],
        [0.1828, 0.5570],
        [0.5490, 0.1212],
        [0.2501, 0.2750],
        [0.3481, 0.4373],
        [0.3475, 0.7435],
        [0.4671, 0.2719],
        [0.3621, 0.4031],
        [0.3909, 0.2389],
        [0.3114, 0.5124],
        [0.1240, 0.3718],
        [0.1745, 0.2922],
        [0.3455, 0.4789],
        [0.4206, 0.3542],
        [0.2758, 0.2613],
        [0.2718, 0.2825],
        [0.2359, 0.4479],
        [0.5177, 0.3300],
        [0.5761, 0.4069],
        [0.5057, 0.5138],
        [0.1883, 0.3227],
        [0.6071, 0.4967],
        [0.4363, 0.3138],
        [0.2681, 0.2961],
        [0.3116, 0.4401],
        [0.2646, 0.8440],
        [0.1167, 0.4557],
        [0.3569, 0.2142],
        [0.4936, 0.2005]], device='cuda:0'),)

and here is the error:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-9-360b9852dfa7> in <module>()
    137 
    138 print('Running...')
--> 139 big_training_loop(555, optimizer, model, cross_entropy, train_loader, val_loader, accuracy_score=accuracy_score)
    140 

5 frames
/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in linear(input, weight, bias)
   1686         if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops):
   1687             return handle_torch_function(linear, tens_ops, input, weight, bias=bias)
-> 1688     if input.dim() == 2 and bias is not None:
   1689         # fused op is marginally faster
   1690         ret = torch.addmm(bias, input, weight.t())

AttributeError: 'tuple' object has no attribute 'dim'

How should I fix it?

I fixed this issue by doing:

        with torch.no_grad():
            x = self.bert(x)[0]

but I still get bad accuracy :frowning_face:
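
(Note: the [0] indexing also works without torch.no_grad(). Wrapping the BERT call in no_grad() blocks gradients from flowing into BERT, so the encoder is effectively still frozen, which would explain the accuracy staying low. A minimal sketch of the forward pass without it:)

def forward(self, x, mask):
    # keep gradients flowing so BERT is actually fine-tuned
    x = self.bert(x, attention_mask=mask)[0]  # first tuple element = logits
    return x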

Did you remove the lines I copied below and change the loss function to torch.nn.CrossEntropyLoss?

for param in self.bert.parameters():
    param.requires_grad = False

Yes.
Do you have an example of how to implement BERT?
I have spent more than 14 days trying to do it, following all kinds of tutorials, but I keep running into trouble.

Here is another script, from a different tutorial, with the same problem…

**Import the Libraries:**

from transformers import BertTokenizer, BertForSequenceClassification
import torch, time
import torch.optim as optim
import torch.nn as nn
from sklearn.metrics import f1_score, accuracy_score

import random
import numpy as np
import pandas as pd
from torchtext import data
from torchtext.data import TabularDataset
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

**Prepare Data:**

model_class, tokenizer_class, pretrained_weights = (BertForSequenceClassification, BertTokenizer, 'bert-base-cased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)
df_train = pd.read_csv("/content/drive/My Drive/train_set.csv")
df_val = pd.read_csv("/content/drive/My Drive/val_set.csv")
df = pd.concat([df_train, df_val], axis=0)

tokenized = df_train['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
attention_mask = np.where(padded != 0, 1, 0)
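
(As an aside, the tokenizer can produce the same padded ids and attention mask in one call, which is less error-prone; a sketch using the objects defined above:)

encoded = tokenizer(df_train['text'].tolist(), padding=True, truncation=True)
padded = np.array(encoded['input_ids'])
attention_mask = np.array(encoded['attention_mask'])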

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
class Dataset(torch.utils.data.Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, index):
        input_ids = self.input_ids[index]
        attention_mask = self.attention_mask[index]
        labels = self.labels[index]
        return input_ids, attention_mask, labels

train_input_ids = torch.tensor(padded[:len(df_val)])
train_attention_mask = torch.tensor(attention_mask[:len(df_val)])
# labels must come from df_train (which `padded` was built from);
# slicing the concatenated df would misalign validation labels with their inputs
train_labels = list(df_train['label'][:len(df_val)])

validation_input_ids = torch.tensor(padded[len(df_val):])
validation_attention_mask = torch.tensor(attention_mask[len(df_val):])
validation_labels = list(df_train['label'][len(df_val):])


dataset = Dataset(input_ids=train_input_ids, attention_masks=train_attention_mask, labels=train_labels)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, drop_last=False)

validation_dataset = Dataset(input_ids=validation_input_ids, attention_masks=validation_attention_mask, labels=validation_labels)
validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=8, drop_last=False)

**Train the model:**

optimizer = torch.optim.AdamW(model.parameters(), lr=.01)
cross_entropy = nn.CrossEntropyLoss().to(device)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=.1, patience=2, verbose=True)

model.to(device)
def big_training_loop(epochs, optim, clf, lossf, train_loader, val_loader, accuracy_score):
    # variables for early stopping
    n_epochs_stop = 10
    epochs_no_improve = 0
    min_val_loss = float('inf')

    # running epoch
    for epoch in range(1, epochs + 1):

        # variables for performance monitoring
        loss_train_list = []
        loss_val_list = []
        correct_train_list_fscore = []
        correct_train_list = []
        correct_val_list_fscore = []
        correct_val_list = []

        # training on batches (note: zip stops at the shorter of the two loaders)
        for (text, mask, labels), (val_text, val_mask, val_labels) in zip(train_loader, val_loader):
            # - training section - #
            clf.train()
            text, mask, labels = text.to(device), mask.to(device), labels.to(device)
            output = model(text, token_type_ids=None, attention_mask=mask)[0]
            # compute loss function
            loss = lossf(output, labels)
            # append loss output to loss_train_list for monitoring
            loss_train_list.append(loss.item())

            # convert tensors to numpy array
            labels = labels.cpu().detach().numpy()
            output = torch.argmax(output, dim=1).cpu().detach().numpy()
            # calculate accuracy and macro F1 (y_true first, then y_pred)
            correct_train_list_fscore.append(f1_score(labels, output, average='macro'))
            correct_train_list.append(accuracy_score(labels, output))
            # compute back propagation
            loss.backward()
            # update weights
            optim.zero_grad()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optim.step()
            # clean gradients to not accumulate

            # - evaluation section - #
            clf.eval()
            with torch.no_grad():
                # run the validation batch through the model
                val_text, val_mask, val_labels = val_text.to(device), val_mask.to(device), val_labels.to(device)
                val_output = model(val_text, token_type_ids=None, attention_mask=val_mask)[0]
                # compute loss function
                val_loss = lossf(val_output, val_labels)
                # append loss output to loss_val_list for monitoring
                loss_val_list.append(val_loss.item())

                # convert tensors to numpy array
                val_output = torch.argmax(val_output, dim=1).cpu().detach().numpy()
                val_labels = val_labels.cpu().detach().numpy()

                # calculate accuracy and macro F1 (y_true first, then y_pred)
                correct_val_list.append(accuracy_score(val_labels, val_output))
                correct_val_list_fscore.append(f1_score(val_labels, val_output, average='macro'))

            # average the running metrics for monitoring
            loss = torch.mean(torch.FloatTensor(loss_train_list))
            val_loss = torch.mean(torch.FloatTensor(loss_val_list))
            acc = torch.mean(torch.FloatTensor(correct_train_list)) * 100
            fscore = torch.mean(torch.FloatTensor(correct_train_list_fscore)) * 100
            val_acc = torch.mean(torch.FloatTensor(correct_val_list)) * 100
            val_fscore = torch.mean(torch.FloatTensor(correct_val_list_fscore)) * 100

        # reduce the learning rate if the validation loss plateaued
        scheduler.step(val_loss)

        # save best model if the loss is the best
        if torch.mean(torch.FloatTensor(loss_val_list)) < min_val_loss:
            # variables for best performance
            best_epoch = epoch
            best_loss = loss
            best_val_loss = val_loss
            best_acc = acc
            best_fscore = fscore
            best_val_acc = val_acc
            best_val_fscore = val_fscore
            # reset the early-stopping counter and record the best loss
            epochs_no_improve = 0
            min_val_loss = best_val_loss
            # print the current epoch as the best epoch

            print(
                f'BEST EPOCH: Epoch({epoch}) -> Train: (Accuracy: {best_acc:.1f}, f-score: {best_fscore:.1f}, Loss: {best_loss:.4f}) | Val: (Accuracy: {best_val_acc:.1f}, f-score: {best_val_fscore:.1f}, Loss: {best_val_loss:.4f})')

        else:
            # print the current epoch as a normal epoch
            print(
                f'Epoch({epoch}) -> Train: (Accuracy: {acc:.1f}, f-score: {fscore:.1f}, Loss: {loss:.4f}) | Val: (Accuracy: {val_acc:.1f}, f-score: {val_fscore:.1f}, Loss: {val_loss:.4f})')
            # if epochs_no_improve reached n_epochs_stop the training will stop
            epochs_no_improve += 1

        # early stop the training
        if epoch > 5 and epochs_no_improve == n_epochs_stop:
            torch.save(clf,
                       f'clf_val_loss_{best_val_loss:.4f}_f-score_{best_val_fscore:.1f}_val_acc_{best_val_acc:.1f}.pt')
            print('Early stopping!')
            print()
            print(
                f'BEST EPOCH: Epoch({best_epoch}) -> Train: (Accuracy: {best_acc:.1f}, f-score: {best_fscore:.1f}, Loss: {best_loss:.4f}) | Val: (Accuracy: {best_val_acc:.1f}, f-score: {best_val_fscore:.1f}, Loss: {best_val_loss:.4f})')
            break


print('Running...')
big_training_loop(555, optimizer, model, cross_entropy, dataloader, validation_dataloader, accuracy_score=accuracy_score)
Running...
BEST EPOCH: Epoch(1) -> Train: (Accuracy: 66.0, f-score: 42.7, Loss: 0.6451) | Val: (Accuracy: 65.3, f-score: 45.0, Loss: 0.6572)
BEST EPOCH: Epoch(2) -> Train: (Accuracy: 66.0, f-score: 42.7, Loss: 0.6510) | Val: (Accuracy: 65.3, f-score: 45.0, Loss: 0.6567)
BEST EPOCH: Epoch(3) -> Train: (Accuracy: 66.0, f-score: 42.7, Loss: 0.6487) | Val: (Accuracy: 65.3, f-score: 45.0, Loss: 0.6563)
BEST EPOCH: Epoch(4) -> Train: (Accuracy: 66.0, f-score: 42.7, Loss: 0.6472) | Val: (Accuracy: 65.3, f-score: 45.0, Loss: 0.6559)
BEST EPOCH: Epoch(5) -> Train: (Accuracy: 66.0, f-score: 42.7, Loss: 0.6471) | Val: (Accuracy: 65.3, f-score: 45.0, Loss: 0.6555)
BEST EPOCH: Epoch(6) -> Train: (Accuracy: 66.0, f-score: 42.7, Loss: 0.6519) | Val: (Accuracy: 65.3, f-score: 45.0, Loss: 0.6551)
BEST EPOCH: Epoch(7) -> Train: (Accuracy: 66.0, f-score: 42.7, Loss: 0.6591) | Val: (Accuracy: 65.3, f-score: 45.0, Loss: 0.6548)

I did work with the BERT model very recently, but I’m not allowed to share the code publicly.

This time I did a more thorough pass through the code. You are clearing the gradients (with optim.zero_grad()) before applying them (optim.step()). Basically, the model is not learning anything. Put the optim.zero_grad() call at the beginning of the for loop.

loss.backward()
# update weights
optim.zero_grad() <- gradients are zeroed out after this call
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optim.step()
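
So the corrected order would be (a sketch of the fixed loop body):

optim.zero_grad()  # clear old gradients first
output = model(text, token_type_ids=None, attention_mask=mask)[0]
loss = lossf(output, labels)
loss.backward()  # compute gradients
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optim.step()  # apply gradients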

THANKS SO MUCH for your time, you don’t know how hard I tried to fix this stupid problem :heart: