Why do I get worse accuracy when using BERT?

I’m using the following code:

**Import the Libraries:**

! pip install transformers

import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoModel, BertTokenizer, AdamW
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np

**Prepare the Data:**

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

train_set = pd.read_csv('/content/drive/My Drive/train_set.csv')
val_set = pd.read_csv('/content/drive/My Drive/val_set.csv')
X_train, y_train = train_set['text'], train_set['label']
X_val, y_val = val_set['text'], val_set['label']

# compute the class weights to counter label imbalance
class_wts = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)

# convert class weights to tensor
weights = torch.tensor(class_wts, dtype=torch.float)
weights = weights.to(device)
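
(For reference, 'balanced' weights are n_samples / (n_classes * count_per_class), so the rarer class gets the larger weight; a quick check with hypothetical labels, using the imports above:)

y = np.array([0, 0, 0, 1])  # hypothetical, imbalanced labels
print(compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y))
# -> approximately [0.667, 2.0], i.e. 4 / (2 * 3) and 4 / (2 * 1)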

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

X_train = tokenizer.batch_encode_plus(
    X_train.tolist(),
    max_length=30, padding='max_length', truncation=True,  # pad_to_max_length is a boolean flag; the length belongs in max_length
    return_token_type_ids=False, return_length=True, return_attention_mask=True)


train_label = torch.tensor(train_set['label'].tolist(), dtype=torch.long)

train_length = torch.tensor(X_train['length'])  # I will not use this in the training for now
train_input = torch.tensor(X_train['input_ids'])
train_attention_mask = torch.tensor(X_train['attention_mask'])
train_set = TensorDataset(train_input, train_length, train_attention_mask, train_label)


X_val = tokenizer.batch_encode_plus(
    X_val.tolist(),
    max_length=30, padding='max_length', truncation=True,
    return_token_type_ids=False, return_length=True, return_attention_mask=True)

val_label = torch.tensor(val_set['label'].tolist(), dtype=torch.long)
val_length = torch.tensor(X_val['length'])  # I will not use this in the training for now
val_input = torch.tensor(X_val['input_ids'])
val_attention_mask = torch.tensor(X_val['attention_mask'])
val_set = TensorDataset(val_input, val_length, val_attention_mask, val_label)

train_loader = DataLoader(train_set, batch_size=50, sampler=RandomSampler(train_set))
val_loader = DataLoader(val_set, batch_size=50, sampler=SequentialSampler(val_set))

**Training the model:**

class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.bert = AutoModel.from_pretrained('bert-base-uncased')
        for param in self.bert.parameters():
            param.requires_grad = False
        self.linear = nn.Linear(self.bert.config.hidden_size, 200)
        self.drop = nn.Dropout(.5)
        self.output = nn.Linear(200, 2)

    def forward(self, x, mask, lens):
        _, x = self.bert(x, attention_mask=mask)  # second tuple element is the pooled [CLS] output
        x = self.drop(x)
        x = nn.functional.relu(self.linear(x))
        x = self.output(x)
        return x

model = BERT()

model.to(device)
weights= torch.tensor(class_wts,dtype=torch.float)
weights = weights.to(device)

optimizer = AdamW(model.parameters(), lr=.01)
cross_entropy = nn.NLLLoss(weight=weights).to(device)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=.1, patience=2, verbose=True)


def big_training_loop(epochs, optim, clf, lossf, train_loader, val_loader, accuracy_score):
    # variables for early stopping
    n_epochs_stop = 10
    epochs_no_improve = 0
    min_val_loss = float('inf')

    # running epoch
    for epoch in range(1, epochs + 1):

        # variables for performance monitoring
        loss_train_list = []
        loss_val_list = []
        correct_train_list_fscore = []
        correct_train_list = []
        correct_val_list_fscore = []
        correct_val_list = []

        # training on batches (note: zip stops at the shorter of the two loaders)
        for (text, lens, mask, labels), (val_text, val_lens, val_mask, val_labels) in zip(train_loader, val_loader):
            # - training section - #
            clf.train()
            text, lens, mask, labels = text.to(device), lens, mask.to(device), labels.to(device)
            output = clf(text, mask, lens)
            # compute loss function
            loss = lossf(output, labels)
            # append loss output to loss_train_list for monitoring
            loss_train_list.append(loss.item())

            # convert tensors to numpy array
            labels = labels.cpu().detach().numpy()
            output = torch.argmax(output, dim=1).cpu().detach().numpy()
            # calculate accuracy and macro F1 (y_true first, then y_pred)
            correct_train_list_fscore.append(f1_score(labels, output, average='macro'))
            correct_train_list.append(accuracy_score(labels, output))
            # compute back propagation
            loss.backward()
            # update weights
            optim.zero_grad()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optim.step()
            # clean gradients to not accumulate

            # - evaluation section - #
            clf.eval()
            with torch.no_grad():
                # run the validation batch through the model
                val_text, val_lens, val_mask, val_labels = val_text.to(device), val_lens.to(device), val_mask.to(device), val_labels.to(device)
                val_output = clf(val_text, val_mask, val_lens).squeeze()

                # compute loss function
                val_loss = lossf(val_output, val_labels)
                # append loss output to loss_val_list for monitoring
                loss_val_list.append(val_loss.item())

                # convert tensors to numpy array
                val_output = torch.argmax(val_output, dim=1).cpu().detach().numpy()
                val_labels = val_labels.cpu().detach().numpy()

                # calculate accuracy and macro F1 (y_true first, then y_pred)
                correct_val_list.append(accuracy_score(val_labels, val_output))
                correct_val_list_fscore.append(f1_score(val_labels, val_output, average='macro'))
              
            # average the running metrics for monitoring
            loss = torch.mean(torch.FloatTensor(loss_train_list))
            val_loss = torch.mean(torch.FloatTensor(loss_val_list))
            acc = torch.mean(torch.FloatTensor(correct_train_list)) * 100
            fscore = torch.mean(torch.FloatTensor(correct_train_list_fscore)) * 100
            val_acc = torch.mean(torch.FloatTensor(correct_val_list)) * 100
            val_fscore = torch.mean(torch.FloatTensor(correct_val_list_fscore)) * 100

        # reduce the learning rate if the validation loss plateaued
        scheduler.step(val_loss)

        # save best model if the loss is the best
        if torch.mean(torch.FloatTensor(loss_val_list)) < min_val_loss:
          # variables for best performance
          best_epoch = epoch
          best_loss = loss
          best_val_loss = val_loss
          best_acc = acc
          best_fscore = fscore
          best_val_acc = val_acc
          best_val_fscore = val_fscore
          # reset the early-stopping counter and record the best loss
          epochs_no_improve = 0
          min_val_loss = best_val_loss
          # print the current epoch as the best epoch

          print(
              f'BEST EPOCH: Epoch({epoch}) -> Train: (Accuracy: {best_acc:.1f}, f-score: {best_fscore:.1f}, Loss: {best_loss:.4f}) | Val: (Accuracy: {best_val_acc:.1f}, f-score: {best_val_fscore:.1f}, Loss: {best_val_loss:.4f})')

        else:
            # print the current epoch as a normal epoch
            print(
                f'Epoch({epoch}) -> Train: (Accuracy: {acc:.1f}, f-score: {fscore:.1f}, Loss: {loss:.4f}) | Val: (Accuracy: {val_acc:.1f}, f-score: {val_fscore:.1f}, Loss: {val_loss:.4f})')
            # if epochs_no_improve reached n_epochs_stop the training will stop
            epochs_no_improve += 1

        # early stop the training
        if epoch > 5 and epochs_no_improve == n_epochs_stop:
            torch.save(clf, f'clf_val_loss_{best_val_loss:.4f}_f-score_{best_val_fscore:.1f}_val_acc_{best_val_acc:.1f}.pt')
            print('Early stopping!')
            print()
            print(
                f'BEST EPOCH: Epoch({best_epoch}) -> Train: (Accuracy: {best_acc:.1f}, f-score: {best_fscore:.1f}, Loss: {best_loss:.4f}) | Val: (Accuracy: {best_val_acc:.1f}, f-score: {best_val_fscore:.1f}, Loss: {best_val_loss:.4f})')
            break

print('Running...')
big_training_loop(555, optimizer, model, cross_entropy, train_loader, val_loader, accuracy_score=accuracy_score)
Running...
BEST EPOCH: Epoch(1) -> Train: (Accuracy: 28.4, f-score: 25.2, Loss: -0.0608) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(2) -> Train: (Accuracy: 28.0, f-score: 25.4, Loss: -0.0458) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(3) -> Train: (Accuracy: 32.5, f-score: 29.5, Loss: -0.0696) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(4) -> Train: (Accuracy: 36.0, f-score: 33.0, Loss: -0.0618) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(5) -> Train: (Accuracy: 31.1, f-score: 27.0, Loss: -0.0599) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(6) -> Train: (Accuracy: 35.1, f-score: 31.0, Loss: -0.0712) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(7) -> Train: (Accuracy: 34.0, f-score: 31.3, Loss: -0.0641) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(8) -> Train: (Accuracy: 34.7, f-score: 30.9, Loss: -0.0722) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(9) -> Train: (Accuracy: 31.8, f-score: 28.1, Loss: -0.0570) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(10) -> Train: (Accuracy: 35.3, f-score: 31.3, Loss: -0.0598) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(11) -> Train: (Accuracy: 38.0, f-score: 33.8, Loss: -0.0860) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Early stopping!

BEST EPOCH: Epoch(1) -> Train: (Accuracy: 28.4, f-score: 25.2, Loss: -0.0608) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)

I don’t know why the accuracy is so low; I’ve tried many loss functions.

The way BERT is implemented here is inspired by this tutorial.

Why would you freeze the model params? Yes, the model is pretrained, but it has to be fine-tuned on the downstream task; without that, it will most likely act as a random classifier. And for the loss function, use CrossEntropyLoss: NLLLoss expects log-probabilities, while your model outputs raw logits, which is why the reported loss goes negative. Btw, instead of defining your own BERT for sequence classification, you can use the built-in BertForSequenceClassification from HuggingFace Transformers.
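
For example, a minimal fine-tuning setup along those lines might look like this (a sketch assuming binary labels and the DataLoader defined above; lr=2e-5 is a common fine-tuning choice, not taken from this thread):

from transformers import BertForSequenceClassification, AdamW

# all parameters stay trainable, so BERT itself is fine-tuned on the task
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model.to(device)

optimizer = AdamW(model.parameters(), lr=2e-5)  # small LR, typical for fine-tuning
loss_fn = nn.CrossEntropyLoss(weight=weights)   # expects raw logits, not log-probabilities

model.train()
for text, lens, mask, labels in train_loader:
    optimizer.zero_grad()  # clear old gradients before each batch
    logits = model(text.to(device), attention_mask=mask.to(device))[0]  # first tuple element = logits
    loss = loss_fn(logits, labels.to(device))
    loss.backward()
    optimizer.step()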


I changed the classifier to:

self.bert = BertForSequenceClassification.from_pretrained('bert-base-uncased')
and for the forward function:
x = self.bert(x, attention_mask=mask)

but I ran into another problem: the output of self.bert is a tuple containing a tensor, and I want to extract the tensor from that tuple:

(tensor([[0.6515, 0.6653],
        [0.2553, 0.1504],
        [0.4994, 0.4797],
        [0.6337, 0.2966],
        [0.2209, 0.5274],
        [0.5424, 0.3485],
        [0.3630, 0.1887],
        [0.2545, 0.7262],
        [0.2021, 0.4187],
        [0.5003, 0.1696],
        [0.4143, 0.5789],
        [0.4960, 0.4392],
        [0.4121, 0.5264],
        [0.5776, 0.4939],
        [0.3785, 0.4943],
        [0.4546, 0.4430],
        [0.6860, 0.3314],
        [0.4527, 0.3885],
        [0.2221, 0.6505],
        [0.3507, 0.2968],
        [0.3292, 0.6263],
        [0.3475, 0.3593],
        [0.1828, 0.5570],
        [0.5490, 0.1212],
        [0.2501, 0.2750],
        [0.3481, 0.4373],
        [0.3475, 0.7435],
        [0.4671, 0.2719],
        [0.3621, 0.4031],
        [0.3909, 0.2389],
        [0.3114, 0.5124],
        [0.1240, 0.3718],
        [0.1745, 0.2922],
        [0.3455, 0.4789],
        [0.4206, 0.3542],
        [0.2758, 0.2613],
        [0.2718, 0.2825],
        [0.2359, 0.4479],
        [0.5177, 0.3300],
        [0.5761, 0.4069],
        [0.5057, 0.5138],
        [0.1883, 0.3227],
        [0.6071, 0.4967],
        [0.4363, 0.3138],
        [0.2681, 0.2961],
        [0.3116, 0.4401],
        [0.2646, 0.8440],
        [0.1167, 0.4557],
        [0.3569, 0.2142],
        [0.4936, 0.2005]], device='cuda:0'),)

and here is the error:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-9-360b9852dfa7> in <module>()
    137 
    138 print('Running...')
--> 139 big_training_loop(555, optimizer, model, cross_entropy, train_loader, val_loader, accuracy_score=accuracy_score)
    140 

5 frames
/usr/local/lib/python3.6/dist-packages/torch/nn/functional.py in linear(input, weight, bias)
   1686         if any([type(t) is not Tensor for t in tens_ops]) and has_torch_function(tens_ops):
   1687             return handle_torch_function(linear, tens_ops, input, weight, bias=bias)
-> 1688     if input.dim() == 2 and bias is not None:
   1689         # fused op is marginally faster
   1690         ret = torch.addmm(bias, input, weight.t())

AttributeError: 'tuple' object has no attribute 'dim'

How should I fix it?

I fixed this issue by doing:

        with torch.no_grad():
            x = self.bert(x)[0]

but I still get bad accuracy :frowning_face:
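
(Note: the [0] indexing also works without torch.no_grad(). Wrapping the BERT call in no_grad() blocks gradients from flowing into BERT, so the encoder is effectively still frozen, which would explain the accuracy staying low. A minimal sketch of the forward pass without it:)

def forward(self, x, mask):
    # keep gradients flowing so BERT is actually fine-tuned
    x = self.bert(x, attention_mask=mask)[0]  # first tuple element = logits
    return x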

Did you remove the lines I copied below and change the loss function to torch.nn.CrossEntropyLoss?

for param in self.bert.parameters():
    param.requires_grad = False

Yes.
Do you have an example of how to implement BERT?
I have spent more than 14 days trying to do it, following all kinds of tutorials, but I keep running into trouble.

Here is another script, from a different tutorial, with the same problem…

**Import the Libraries:**

from transformers import BertTokenizer, BertForSequenceClassification
import torch, time
import torch.optim as optim
import torch.nn as nn
from sklearn.metrics import f1_score, accuracy_score

import random
import numpy as np
import pandas as pd
from torchtext import data
from torchtext.data import TabularDataset
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

**Prepare Data:**

model_class, tokenizer_class, pretrained_weights = (BertForSequenceClassification, BertTokenizer, 'bert-base-cased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)
df_train = pd.read_csv("/content/drive/My Drive/train_set.csv")
df_val = pd.read_csv("/content/drive/My Drive/val_set.csv")
df = pd.concat([df_train, df_val], axis=0)

tokenized = df_train['text'].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

max_len = 0
for i in tokenized.values:
    if len(i) > max_len:
        max_len = len(i)

padded = np.array([i + [0]*(max_len-len(i)) for i in tokenized.values])
attention_mask = np.where(padded != 0, 1, 0)
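
(As an aside, the tokenizer can produce the same padded ids and attention mask in one call, which is less error-prone; a sketch using the objects defined above:)

encoded = tokenizer(df_train['text'].tolist(), padding=True, truncation=True)
padded = np.array(encoded['input_ids'])
attention_mask = np.array(encoded['attention_mask'])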

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
class Dataset(torch.utils.data.Dataset):
    def __init__(self, input_ids, attention_masks, labels):
        self.input_ids = input_ids
        self.attention_mask = attention_masks
        self.labels = labels

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, index):
        input_ids = self.input_ids[index]
        attention_mask = self.attention_mask[index]
        labels = self.labels[index]
        return input_ids, attention_mask, labels

train_input_ids = torch.tensor(padded[:len(df_val)])
train_attention_mask = torch.tensor(attention_mask[:len(df_val)])
# labels must come from df_train (which `padded` was built from);
# slicing the concatenated df would misalign validation labels with their inputs
train_labels = list(df_train['label'][:len(df_val)])

validation_input_ids = torch.tensor(padded[len(df_val):])
validation_attention_mask = torch.tensor(attention_mask[len(df_val):])
validation_labels = list(df_train['label'][len(df_val):])


dataset = Dataset(input_ids=train_input_ids, attention_masks=train_attention_mask, labels=train_labels)
dataloader = torch.utils.data.DataLoader(dataset, batch_size=8, drop_last=False)

validation_dataset = Dataset(input_ids=validation_input_ids, attention_masks=validation_attention_mask, labels=validation_labels)
validation_dataloader = torch.utils.data.DataLoader(validation_dataset, batch_size=8, drop_last=False)

**Train the model:**

optimizer = torch.optim.AdamW(model.parameters(), lr=.01)
cross_entropy = nn.CrossEntropyLoss().to(device)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=.1, patience=2, verbose=True)

model.to(device)
def big_training_loop(epochs, optim, clf, lossf, train_loader, val_loader, accuracy_score):
    # variables for early stopping
    n_epochs_stop = 10
    epochs_no_improve = 0
    min_val_loss = float('inf')

    # running epoch
    for epoch in range(1, epochs + 1):

        # variables for performance monitoring
        loss_train_list = []
        loss_val_list = []
        correct_train_list_fscore = []
        correct_train_list = []
        correct_val_list_fscore = []
        correct_val_list = []

        # training on batches (note: zip stops at the shorter of the two loaders)
        for (text, mask, labels), (val_text, val_mask, val_labels) in zip(train_loader, val_loader):
            # - training section - #
            clf.train()
            text, mask, labels = text.to(device), mask.to(device), labels.to(device)
            output = model(text, token_type_ids=None, attention_mask=mask)[0]
            # compute loss function
            loss = lossf(output, labels)
            # append loss output to loss_train_list for monitoring
            loss_train_list.append(loss.item())

            # convert tensors to numpy array
            labels = labels.cpu().detach().numpy()
            output = torch.argmax(output, dim=1).cpu().detach().numpy()
            # calculate accuracy and macro F1 (y_true first, then y_pred)
            correct_train_list_fscore.append(f1_score(labels, output, average='macro'))
            correct_train_list.append(accuracy_score(labels, output))
            # compute back propagation
            loss.backward()
            # update weights
            optim.zero_grad()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optim.step()
            # clean gradients to not accumulate

            # - evaluation section - #
            clf.eval()
            with torch.no_grad():
                # run the validation batch through the model
                val_text, val_mask, val_labels = val_text.to(device), val_mask.to(device), val_labels.to(device)
                val_output = model(val_text, token_type_ids=None, attention_mask=val_mask)[0]
                # compute loss function
                val_loss = lossf(val_output, val_labels)
                # append loss output to loss_val_list for monitoring
                loss_val_list.append(val_loss.item())

                # convert tensors to numpy array
                val_output = torch.argmax(val_output, dim=1).cpu().detach().numpy()
                val_labels = val_labels.cpu().detach().numpy()

                # calculate accuracy and macro F1 (y_true first, then y_pred)
                correct_val_list.append(accuracy_score(val_labels, val_output))
                correct_val_list_fscore.append(f1_score(val_labels, val_output, average='macro'))

            # average the running metrics for monitoring
            loss = torch.mean(torch.FloatTensor(loss_train_list))
            val_loss = torch.mean(torch.FloatTensor(loss_val_list))
            acc = torch.mean(torch.FloatTensor(correct_train_list)) * 100
            fscore = torch.mean(torch.FloatTensor(correct_train_list_fscore)) * 100
            val_acc = torch.mean(torch.FloatTensor(correct_val_list)) * 100
            val_fscore = torch.mean(torch.FloatTensor(correct_val_list_fscore)) * 100

        # reduce the learning rate if the validation loss plateaued
        scheduler.step(val_loss)

        # save best model if the loss is the best
        if torch.mean(torch.FloatTensor(loss_val_list)) < min_val_loss:
            # variables for best performance
            best_epoch = epoch
            best_loss = loss
            best_val_loss = val_loss
            best_acc = acc
            best_fscore = fscore
            best_val_acc = val_acc
            best_val_fscore = val_fscore
            # reset the early-stopping counter and record the best loss
            epochs_no_improve = 0
            min_val_loss = best_val_loss
            # print the current epoch as the best epoch

            print(
                f'BEST EPOCH: Epoch({epoch}) -> Train: (Accuracy: {best_acc:.1f}, f-score: {best_fscore:.1f}, Loss: {best_loss:.4f}) | Val: (Accuracy: {best_val_acc:.1f}, f-score: {best_val_fscore:.1f}, Loss: {best_val_loss:.4f})')

        else:
            # print the current epoch as a normal epoch
            print(
                f'Epoch({epoch}) -> Train: (Accuracy: {acc:.1f}, f-score: {fscore:.1f}, Loss: {loss:.4f}) | Val: (Accuracy: {val_acc:.1f}, f-score: {val_fscore:.1f}, Loss: {val_loss:.4f})')
            # if epochs_no_improve reached n_epochs_stop the training will stop
            epochs_no_improve += 1

        # early stop the training
        if epoch > 5 and epochs_no_improve == n_epochs_stop:
            torch.save(clf,
                       f'clf_val_loss_{best_val_loss:.4f}_f-score_{best_val_fscore:.1f}_val_acc_{best_val_acc:.1f}.pt')
            print('Early stopping!')
            print()
            print(
                f'BEST EPOCH: Epoch({best_epoch}) -> Train: (Accuracy: {best_acc:.1f}, f-score: {best_fscore:.1f}, Loss: {best_loss:.4f}) | Val: (Accuracy: {best_val_acc:.1f}, f-score: {best_val_fscore:.1f}, Loss: {best_val_loss:.4f})')
            break


print('Running...')
big_training_loop(555, optimizer, model, cross_entropy, dataloader, validation_dataloader, accuracy_score=accuracy_score)
Running...
BEST EPOCH: Epoch(1) -> Train: (Accuracy: 66.0, f-score: 42.7, Loss: 0.6451) | Val: (Accuracy: 65.3, f-score: 45.0, Loss: 0.6572)
BEST EPOCH: Epoch(2) -> Train: (Accuracy: 66.0, f-score: 42.7, Loss: 0.6510) | Val: (Accuracy: 65.3, f-score: 45.0, Loss: 0.6567)
BEST EPOCH: Epoch(3) -> Train: (Accuracy: 66.0, f-score: 42.7, Loss: 0.6487) | Val: (Accuracy: 65.3, f-score: 45.0, Loss: 0.6563)
BEST EPOCH: Epoch(4) -> Train: (Accuracy: 66.0, f-score: 42.7, Loss: 0.6472) | Val: (Accuracy: 65.3, f-score: 45.0, Loss: 0.6559)
BEST EPOCH: Epoch(5) -> Train: (Accuracy: 66.0, f-score: 42.7, Loss: 0.6471) | Val: (Accuracy: 65.3, f-score: 45.0, Loss: 0.6555)
BEST EPOCH: Epoch(6) -> Train: (Accuracy: 66.0, f-score: 42.7, Loss: 0.6519) | Val: (Accuracy: 65.3, f-score: 45.0, Loss: 0.6551)
BEST EPOCH: Epoch(7) -> Train: (Accuracy: 66.0, f-score: 42.7, Loss: 0.6591) | Val: (Accuracy: 65.3, f-score: 45.0, Loss: 0.6548)

I did work with the BERT model very recently, but I’m not allowed to share the code publicly.

This time I did a more thorough pass through the code. You are clearing the gradients (with optim.zero_grad()) before applying them (optim.step()). Basically, the model is not learning anything. Put the optim.zero_grad() call at the beginning of the for loop.

loss.backward()
# update weights
optim.zero_grad() <- gradients are zeroed out after this call
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optim.step()
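
So the corrected order would be (a sketch of the fixed loop body):

optim.zero_grad()  # clear old gradients first
output = model(text, token_type_ids=None, attention_mask=mask)[0]
loss = lossf(output, labels)
loss.backward()  # compute gradients
torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
optim.step()  # apply gradients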

THANKS SO MUCH for your time, you don’t know how hard I tried to fix this stupid problem :heart: