I’m using the following code:
**Import the Libraries:**
! pip install transformers
import pandas as pd
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, f1_score
from transformers import AutoModel, BertTokenizer, AdamW
from sklearn.utils.class_weight import compute_class_weight
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np
**Prepare the Data:**
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
train_set = pd.read_csv('/content/drive/My Drive/train_set.csv')
val_set = pd.read_csv('/content/drive/My Drive/val_set.csv')
X_train, y_train = train_set['text'], train_set['label']
X_val, y_val = val_set['text'], val_set['label']
# compute the class weights
class_wts = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
# convert class weights to tensor
weights = torch.tensor(class_wts, dtype=torch.float)
weights = weights.to(device)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
X_train = tokenizer.batch_encode_plus(
    X_train.tolist(),
    max_length=30, padding='max_length', truncation=True,
    return_token_type_ids=False, return_length=True,
    return_attention_mask=True)
train_label = torch.tensor(train_set['label'].tolist(), dtype=torch.long)
train_length = torch.tensor(X_train['length'])  # I will not use this in the training for now
train_input = torch.tensor(X_train['input_ids'])
train_attention_mask = torch.tensor(X_train['attention_mask'])
train_set = TensorDataset(train_input, train_length, train_attention_mask, train_label)
X_val = tokenizer.batch_encode_plus(
    X_val.tolist(),
    max_length=30, padding='max_length', truncation=True,
    return_token_type_ids=False, return_length=True,
    return_attention_mask=True)
val_label = torch.tensor(val_set['label'].tolist(), dtype=torch.long)
val_length = torch.tensor(X_val['length'])  # I will not use this in the training for now
val_input = torch.tensor(X_val['input_ids'])
val_attention_mask = torch.tensor(X_val['attention_mask'])
val_set = TensorDataset(val_input, val_length, val_attention_mask, val_label)
train_loader = DataLoader(train_set, batch_size=50, sampler=RandomSampler(train_set))
val_loader = DataLoader(val_set, batch_size=50, sampler=SequentialSampler(val_set))
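For reference, this is a quick way to sanity-check one batch from train_loader (just a sketch, not part of the run shown below; the expected shapes assume batch_size=50 and max_length=30):
# sanity check: inspect one batch from the train loader (shapes only)
text, lens, mask, labels = next(iter(train_loader))
print(text.shape)    # expected (50, 30): token ids
print(mask.shape)    # expected (50, 30): attention mask
print(labels.shape)  # expected (50,): class labels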
**Training the model:**
class BERT(nn.Module):
    def __init__(self):
        super(BERT, self).__init__()
        self.bert = AutoModel.from_pretrained('bert-base-uncased')
        # freeze the BERT encoder so only the classification head is trained
        for param in self.bert.parameters():
            param.requires_grad = False
        self.linear = nn.Linear(self.bert.config.hidden_size, 200)
        self.drop = nn.Dropout(.5)
        self.output = nn.Linear(200, 2)

    def forward(self, x, mask, lens):
        # take the pooled [CLS] output from BERT (return_dict=False keeps the tuple output)
        _, x = self.bert(x, attention_mask=mask, return_dict=False)
        x = self.drop(x)
        x = nn.functional.relu(self.linear(x))
        x = self.output(x)
        return x
model = BERT()
model.to(device)
weights = torch.tensor(class_wts, dtype=torch.float)
weights = weights.to(device)
optimizer = AdamW(model.parameters(), lr=.01)
cross_entropy = nn.NLLLoss(weight=weights).to(device)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=.1, patience=2, verbose=True)
def big_training_loop(epochs, optim, clf, lossf, train_loader, val_loader, accuracy_score):
    # variables for early stopping
    n_epochs_stop = 10
    epochs_no_improve = 0
    min_val_loss = float('inf')
    # loop over epochs
    for epoch in range(1, epochs + 1):
        # variables for performance monitoring
        loss_train_list = []
        loss_val_list = []
        correct_train_list_fscore = []
        correct_train_list = []
        correct_val_list_fscore = []
        correct_val_list = []
        # iterate over batches
        for (text, lens, mask, labels), (val_text, val_lens, val_mask, val_labels) in zip(train_loader, val_loader):
            # - training section - #
            clf.train()
            text, lens, mask, labels = text.to(device), lens, mask.to(device), labels.to(device)
            output = clf(text, mask, lens)
            # compute the loss
            loss = lossf(output, labels)
            # append the loss to loss_train_list for monitoring
            loss_train_list.append(loss.item())
            # convert tensors to numpy arrays
            labels = labels.cpu().detach().numpy()
            output = torch.argmax(output, dim=1).cpu().detach().numpy()
            # compute f-score and accuracy on this batch
            correct_train_list_fscore.append(f1_score(output, labels, average='macro'))
            correct_train_list.append(accuracy_score(output, labels))
            # backpropagate
            loss.backward()
            # zero the gradients, clip them, and step the optimizer
            optim.zero_grad()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optim.step()
            # - evaluation section - #
            clf.eval()
            with torch.no_grad():
                val_text, val_lens, val_mask, val_labels = val_text.to(device), val_lens.to(device), val_mask.to(device), val_labels.to(device)
                val_output = clf(val_text, val_mask, val_lens).squeeze()
                # compute the loss
                val_loss = lossf(val_output, val_labels)
                # append the loss to loss_val_list for monitoring
                loss_val_list.append(val_loss.item())
                # convert tensors to numpy arrays
                val_output = torch.argmax(val_output, dim=1).cpu().detach().numpy()
                val_labels = val_labels.cpu().detach().numpy()
                # compute accuracy and f-score on this batch
                correct_val_list.append(accuracy_score(val_output, val_labels))
                correct_val_list_fscore.append(f1_score(val_output, val_labels, average='macro'))
        # average the metrics over the epoch
        loss = torch.mean(torch.FloatTensor(loss_train_list))
        val_loss = torch.mean(torch.FloatTensor(loss_val_list))
        acc = torch.mean(torch.FloatTensor(correct_train_list)) * 100
        fscore = torch.mean(torch.FloatTensor(correct_train_list_fscore)) * 100
        val_acc = torch.mean(torch.FloatTensor(correct_val_list)) * 100
        val_fscore = torch.mean(torch.FloatTensor(correct_val_list_fscore)) * 100
        # reduce the lr if the validation loss didn't decrease
        scheduler.step(val_loss)
        # keep the best stats if the validation loss is the best so far
        if torch.mean(torch.FloatTensor(loss_val_list)) < min_val_loss:
            # variables for the best performance
            best_epoch = epoch
            best_loss = loss
            best_val_loss = val_loss
            best_acc = acc
            best_fscore = fscore
            best_val_acc = val_acc
            best_val_fscore = val_fscore
            epochs_no_improve = 0
            min_val_loss = best_val_loss
            # print the current epoch as the best epoch
            print(
                f'BEST EPOCH: Epoch({epoch}) -> Train: (Accuracy: {best_acc:.1f}, f-score: {best_fscore:.1f}, Loss: {best_loss:.4f}) | Val: (Accuracy: {best_val_acc:.1f}, f-score: {best_val_fscore:.1f}, Loss: {best_val_loss:.4f})')
        else:
            # print the current epoch as a normal epoch
            print(
                f'Epoch({epoch}) -> Train: (Accuracy: {acc:.1f}, f-score: {fscore:.1f}, Loss: {loss:.4f}) | Val: (Accuracy: {val_acc:.1f}, f-score: {val_fscore:.1f},Loss: {val_loss:.4f})')
            # if epochs_no_improve reaches n_epochs_stop the training will stop
            epochs_no_improve += 1
            # early stopping
            if epoch > 5 and epochs_no_improve == n_epochs_stop:
                torch.save(clf, f'clf_val_loss_{best_val_loss:.4f}_f-score_{best_val_fscore:.1f}_val_acc_{best_val_acc:.1f}.pt')
                print('Early stopping!')
                print()
                print(
                    f'BEST EPOCH: Epoch({best_epoch}) -> Train: (Accuracy: {best_acc:.1f}, f-score: {best_fscore:.1f}, Loss: {best_loss:.4f}) | Val: (Accuracy: {best_val_acc:.1f}, f-score: {best_val_fscore:.1f}, Loss: {best_val_loss:.4f})')
                break
print('Running...')
big_training_loop(555, optimizer, model, cross_entropy, train_loader, val_loader, accuracy_score=accuracy_score)
This is the output I get:

Running...
BEST EPOCH: Epoch(1) -> Train: (Accuracy: 28.4, f-score: 25.2, Loss: -0.0608) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
Epoch(2) -> Train: (Accuracy: 28.0, f-score: 25.4, Loss: -0.0458) | Val: (Accuracy: 31.3, f-score: 23.5,Loss: -0.0249)
Epoch(3) -> Train: (Accuracy: 32.5, f-score: 29.5, Loss: -0.0696) | Val: (Accuracy: 31.3, f-score: 23.5,Loss: -0.0249)
Epoch(4) -> Train: (Accuracy: 36.0, f-score: 33.0, Loss: -0.0618) | Val: (Accuracy: 31.3, f-score: 23.5,Loss: -0.0249)
Epoch(5) -> Train: (Accuracy: 31.1, f-score: 27.0, Loss: -0.0599) | Val: (Accuracy: 31.3, f-score: 23.5,Loss: -0.0249)
Epoch(6) -> Train: (Accuracy: 35.1, f-score: 31.0, Loss: -0.0712) | Val: (Accuracy: 31.3, f-score: 23.5,Loss: -0.0249)
Epoch(7) -> Train: (Accuracy: 34.0, f-score: 31.3, Loss: -0.0641) | Val: (Accuracy: 31.3, f-score: 23.5,Loss: -0.0249)
Epoch(8) -> Train: (Accuracy: 34.7, f-score: 30.9, Loss: -0.0722) | Val: (Accuracy: 31.3, f-score: 23.5,Loss: -0.0249)
Epoch(9) -> Train: (Accuracy: 31.8, f-score: 28.1, Loss: -0.0570) | Val: (Accuracy: 31.3, f-score: 23.5,Loss: -0.0249)
Epoch(10) -> Train: (Accuracy: 35.3, f-score: 31.3, Loss: -0.0598) | Val: (Accuracy: 31.3, f-score: 23.5,Loss: -0.0249)
Epoch(11) -> Train: (Accuracy: 38.0, f-score: 33.8, Loss: -0.0860) | Val: (Accuracy: 31.3, f-score: 23.5,Loss: -0.0249)
Early stopping!
BEST EPOCH: Epoch(1) -> Train: (Accuracy: 28.4, f-score: 25.2, Loss: -0.0608) | Val: (Accuracy: 31.3, f-score: 23.5, Loss: -0.0249)
I don’t know why the accuracy is so low; I have tried many loss functions.
The way BERT is implemented is inspired by this tutorial.
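For reference, this is my understanding of how the loss functions are supposed to be paired (a minimal standalone sketch, not my actual training code): NLLLoss expects log-probabilities, so it is normally applied after log_softmax, while CrossEntropyLoss takes the raw logits and does both steps internally.
import torch
import torch.nn as nn

logits = torch.randn(4, 2)           # raw outputs for a batch of 4 samples, 2 classes
labels = torch.tensor([0, 1, 1, 0])  # ground-truth class indices

# option 1: NLLLoss on log-probabilities
nll = nn.NLLLoss()
loss_nll = nll(nn.functional.log_softmax(logits, dim=1), labels)

# option 2: CrossEntropyLoss directly on the raw logits (log_softmax + NLLLoss in one step)
ce = nn.CrossEntropyLoss()
loss_ce = ce(logits, labels)

print(loss_nll.item(), loss_ce.item())  # the two values should match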