import transformers
from transformers import BertModel, BertTokenizer
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import torch
import numpy as np
import pandas as pd
import seaborn as sns
from pylab import rcParams
import matplotlib.pyplot as plt
from matplotlib import rc
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
from collections import defaultdict
from textwrap import wrap
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader
import time
import logging
# Route every log record (DEBUG and above) to a file next to the script.
logging.basicConfig(filename='bert_classifier.log', level=logging.DEBUG)
logging.debug('This message should go to the log file')
logging.info('So should this')
logging.warning('And this, too')
# NOTE: a second ``logging.basicConfig(level=logging.ERROR)`` call used to
# follow here. basicConfig does nothing once the root logger has handlers,
# so it never took effect and has been dropped.

# Plot styling.
sns.set(style='whitegrid', palette='muted', font_scale=1.2)
HAPPY_COLORS_PALETTE = ["#01BEFE", "#FFDD00", "#FF7D00", "#FF006D", "#ADFF02", "#8F00FF"]
sns.set_palette(sns.color_palette(HAPPY_COLORS_PALETTE))
#rcParams['figure.figsize'] = 12, 8

# Seed the NumPy and PyTorch RNGs with the same fixed value.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# NOTE(review): the CSV path is empty in this file; fill in the dataset
# location before running. Later code reads the ``Title`` and ``Flag`` columns.
df = pd.read_csv("")
#df.Text = df.Title + "." + df.Bullet + "." + df.Description
def to_sentiment(rating):
    """Collapse a numeric rating into one of three class ids.

    Args:
        rating: Anything ``int()`` accepts (int, float, numeric string).
            Floats are truncated toward zero before the comparison.

    Returns:
        ``0`` when the integer rating is 2 or lower, ``1`` when it is exactly
        3, and ``2`` for anything above 3 (presumably negative / neutral /
        positive; confirm against the label scheme in use).

    Raises:
        ValueError, TypeError: If ``rating`` cannot be converted by ``int()``.
    """
    rating = int(rating)
    if rating <= 2:
        return 0
    elif rating == 3:
        return 1
    else:
        return 2
# Class ids; ``len(class_names)`` sets the size of the classifier's output layer.
class_names = [0, 1, 2]

# Checkpoint name shared by the tokenizer and every BertModel loaded below.
PRE_TRAINED_MODEL_NAME = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(PRE_TRAINED_MODEL_NAME)

# Token length every example is padded / truncated to by the dataset.
MAX_LEN = 150
class GPReviewDataset(Dataset):
    """Map-style dataset that pairs a text with a class label.

    Each item is tokenized on access with the supplied tokenizer and padded or
    truncated to ``max_len`` tokens.
    """

    def __init__(self, reviews, targets, tokenizer, max_len):
        """Store the raw data and the tokenization settings.

        Args:
            reviews: Indexable sequence of texts (items are passed to ``str()``).
            targets: Indexable sequence of labels aligned with ``reviews``.
            tokenizer: Object exposing ``encode_plus`` (e.g. a BertTokenizer).
            max_len: Length every encoded example is padded / truncated to.
        """
        self.reviews = reviews
        self.targets = targets
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.reviews)

    def __getitem__(self, item):
        """Tokenize and return the example at position ``item``.

        Returns:
            dict with keys ``review_text`` (str), ``input_ids`` and
            ``attention_mask`` (flattened tensors from the tokenizer) and
            ``targets`` (0-dim ``torch.long`` tensor).
        """
        review = str(self.reviews[item])
        target = self.targets[item]
        encoding = self.tokenizer.encode_plus(
            review,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            # ``pad_to_max_length=True`` is deprecated; this is its replacement.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            'review_text': review,
            # The tokenizer returns a leading batch axis of size 1; drop it so
            # the DataLoader can stack examples itself.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'targets': torch.tensor(target, dtype=torch.long),
        }
# Hold out 10% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
df_train, df_val = train_test_split(
    df,
    test_size=0.1,
    random_state=RANDOM_SEED,
)
def create_data_loader(df, tokenizer, max_len, batch_size, *, shuffle=False, drop_last=True):
    """Wrap a DataFrame in a ``GPReviewDataset`` and return a DataLoader over it.

    Args:
        df: DataFrame providing a ``Title`` column (used as the text) and a
            ``Flag`` column (used as the label).
        tokenizer: Tokenizer handed to the dataset.
        max_len: Padded / truncated token length per example.
        batch_size: Number of examples per batch.
        shuffle: Reshuffle the data every epoch. Defaults to ``False``, which
            matches the previous behaviour.
        drop_last: Discard a trailing batch smaller than ``batch_size``.
            Defaults to ``True``, which matches the previous behaviour; note
            that a frame with fewer than ``batch_size`` rows then yields no
            batches at all.

    Returns:
        A ``torch.utils.data.DataLoader`` that loads in the main process
        (``num_workers=0``).
    """
    ds = GPReviewDataset(
        reviews=df.Title.to_numpy(),  # df.Text.to_numpy(),
        targets=df.Flag.to_numpy(),
        tokenizer=tokenizer,
        max_len=max_len,
    )
    return DataLoader(
        ds,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=0,
        drop_last=drop_last,
    )
# Mini-batch size shared by the training and validation loaders.
BATCH_SIZE = 128

train_data_loader = create_data_loader(
    df=df_train,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    batch_size=BATCH_SIZE,
)
val_data_loader = create_data_loader(
    df=df_val,
    tokenizer=tokenizer,
    max_len=MAX_LEN,
    batch_size=BATCH_SIZE,
)

# Stand-alone encoder instance.
# NOTE(review): nothing in the visible code reads ``bert_model``; the
# classifier loads its own copy of the checkpoint. Confirm before removing.
bert_model = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
class SentimentClassifier(nn.Module):
    """BERT encoder followed by dropout and a linear classification head."""

    def __init__(self, n_classes):
        """Build the network.

        Args:
            n_classes: Width of the output layer (number of classes).
        """
        super(SentimentClassifier, self).__init__()
        self.bert = BertModel.from_pretrained(PRE_TRAINED_MODEL_NAME)
        self.drop = nn.Dropout(p=0.3)
        self.linear = nn.Linear(self.bert.config.hidden_size, n_classes)
        # Not applied in forward(): nn.CrossEntropyLoss expects raw logits and
        # applies log-softmax itself. Kept so callers can turn logits into
        # probabilities with ``model.out(logits)``.
        self.out = nn.Softmax(dim=1)

    def forward(self, input_ids, attention_mask):
        """Return unnormalised class scores (logits), one row per example.

        Args:
            input_ids: Token id tensor forwarded to BERT.
            attention_mask: Attention mask tensor forwarded to BERT.

        Returns:
            Tensor whose last dimension has ``n_classes`` entries. The argmax
            over ``dim=1`` is the same as it would be after a softmax.
        """
        bert_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        # Index instead of tuple-unpacking: when BERT returns a dict-like
        # ModelOutput, unpacking would yield its string keys, while integer
        # indexing works for both the tuple and the ModelOutput form.
        pooled_output = bert_outputs[1]
        output = self.drop(pooled_output)
        return self.linear(output)
# Build the classifier, wrap it for multi-GPU data parallelism, and move it to
# the selected device.
model = SentimentClassifier(len(class_names))
model = torch.nn.DataParallel(model).to(device)

EPOCHS = 5

optimizer = AdamW(
    model.parameters(),
    lr=2e-5,
    correct_bias=False,
)

# One scheduler step is taken per batch, so the schedule spans
# batches-per-epoch * EPOCHS steps, with no warm-up phase.
total_steps = EPOCHS * len(train_data_loader)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

loss_fn = nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)
def show_gpu(msg):
    """Log the first GPU's memory usage, prefixed with ``msg``.

    Queries ``nvidia-smi`` for ``memory.used`` and ``memory.total`` and writes
    one INFO record of the form ``<msg> <pct>% (<used> out of <total>)``.
    This is a best-effort diagnostic: if ``nvidia-smi`` is missing, fails,
    times out or prints something unparsable, a WARNING is logged and the
    function returns normally, so CPU-only runs are not interrupted.

    ref: Access GPU memory usage in Pytorch - #4 by mjstevens777

    Args:
        msg: Text placed in front of the usage figures.
    """
    # Local import: ``subprocess`` is not among the file's top-level imports.
    import subprocess

    def query(field):
        # Ask nvidia-smi for a single field as bare CSV (no header, no units).
        return subprocess.check_output(
            ['nvidia-smi', f'--query-gpu={field}',
             '--format=csv,nounits,noheader'],
            encoding='utf-8',
            timeout=10,
        )

    def to_int(result):
        # nvidia-smi prints one line per GPU; only the first is reported.
        return int(result.strip().split('\n')[0])

    try:
        used = to_int(query('memory.used'))
        total = to_int(query('memory.total'))
    except (OSError, subprocess.SubprocessError, ValueError) as err:
        logging.warning('%s GPU memory usage unavailable: %s', msg, err)
        return

    pct = used / total if total else 0.0
    logging.info('\n%s %2.1f%% (%s out of %s)', msg, 100 * pct, used, total)
def train_epoch(
    model,
    data_loader,
    loss_fn,
    optimizer,
    device,
    scheduler,
    n_examples
):
    """Run one training pass over ``data_loader``.

    For every batch: forward pass, loss, backward pass, gradient-norm clipping
    at 1.0, optimizer step, scheduler step, gradient reset. A batch that
    raises is logged with its traceback and skipped, and its partial gradients
    are discarded so they cannot leak into the next step.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dicts with ``input_ids``, ``attention_mask``
            and ``targets`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        optimizer: Optimizer over ``model``'s parameters.
        device: Device the batch tensors are moved to.
        scheduler: Learning-rate scheduler, stepped once per batch.
        n_examples: Denominator for the accuracy. If the loader drops or skips
            batches, fewer than ``n_examples`` items are scored and the ratio
            under-reports accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim double tensor equal to
        correct predictions / ``n_examples``; mean_loss is the mean of the
        per-batch losses, or NaN when no batch completed.
    """
    model = model.train()
    losses = []
    # A tensor rather than a plain int, so ``.double()`` below also works when
    # no batch completes.
    correct_predictions = torch.tensor(0, device=device)
    for index, d in enumerate(data_loader, start=1):
        try:
            input_ids = d["input_ids"].to(device)
            attention_mask = d["attention_mask"].to(device)
            targets = d["targets"].to(device)
            outputs = model(
                input_ids=input_ids,
                attention_mask=attention_mask
            )
            _, preds = torch.max(outputs, dim=1)
            loss = loss_fn(outputs, targets)
            correct_predictions += torch.sum(preds == targets)
            losses.append(loss.item())
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            if index % 50 == 0:
                logging.debug("finished steps: {}".format(index))
        except Exception:
            # Deliberately best-effort: one bad batch must not end the epoch.
            logging.exception("error in training step: %d", index)
            optimizer.zero_grad()
    mean_loss = np.mean(losses) if losses else np.nan
    return correct_predictions.double() / n_examples, mean_loss
def eval_model(model, data_loader, loss_fn, device, n_examples):
    """Score ``model`` on ``data_loader`` without updating it.

    Puts the model in eval mode and runs every batch under ``torch.no_grad()``.
    If any batch raises, the error is logged with its traceback, evaluation
    stops, and the figures gathered so far are returned.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...)``.
        data_loader: Iterable of dicts with ``input_ids``, ``attention_mask``
            and ``targets`` tensors.
        loss_fn: Callable ``loss_fn(outputs, targets)`` returning a scalar tensor.
        device: Device the batch tensors are moved to.
        n_examples: Denominator for the accuracy. If the loader drops batches
            or evaluation stops early, the ratio under-reports accuracy.

    Returns:
        ``(accuracy, mean_loss)``: accuracy is a 0-dim double tensor equal to
        correct predictions / ``n_examples``; mean_loss is the mean of the
        per-batch losses, or NaN when no batch completed.
    """
    model = model.eval()
    losses = []
    # A tensor rather than a plain int, so ``.double()`` below also works when
    # the loader is empty or the first batch fails.
    correct_predictions = torch.tensor(0, device=device)
    try:
        with torch.no_grad():
            for d in data_loader:
                input_ids = d["input_ids"].to(device)
                attention_mask = d["attention_mask"].to(device)
                targets = d["targets"].to(device)
                outputs = model(
                    input_ids=input_ids,
                    attention_mask=attention_mask
                )
                _, preds = torch.max(outputs, dim=1)
                loss = loss_fn(outputs, targets)
                correct_predictions += torch.sum(preds == targets)
                losses.append(loss.item())
    except Exception:
        # Deliberately best-effort: report partial results instead of raising.
        logging.exception("eval error")
    mean_loss = np.mean(losses) if losses else np.nan
    return correct_predictions.double() / n_examples, mean_loss
# Per-epoch metrics, keyed by 'train_acc' / 'train_loss' / 'val_acc' / 'val_loss'.
history = defaultdict(list)
best_accuracy = 0
start_time = time.time()
show_gpu('Initial GPU memory usage:')

for epoch in range(EPOCHS):
    logging.info(f'Epoch {epoch + 1}/{EPOCHS}')
    logging.info('-' * 10)

    train_acc, train_loss = train_epoch(
        model,
        train_data_loader,
        loss_fn,
        optimizer,
        device,
        scheduler,
        len(df_train)
    )
    show_gpu('GPU memory usage after training model in epoch {}'.format(epoch))
    logging.info(f'Train loss {train_loss} accuracy {train_acc}')
    logging.info("time taken from start: {}".format(time.time() - start_time))

    val_acc, val_loss = eval_model(
        model,
        val_data_loader,
        loss_fn,
        device,
        len(df_val)
    )
    print(f'Val loss {val_loss} accuracy {val_acc}')
    logging.info("val accuracy {}, validation_loss {}".format(val_acc, val_loss))

    history['train_acc'].append(train_acc)
    history['train_loss'].append(train_loss)
    history['val_acc'].append(val_acc)
    history['val_loss'].append(val_loss)

    # Write a checkpoint whenever validation accuracy improves; the epoch
    # index in the file name means earlier checkpoints are not overwritten.
    if val_acc > best_accuracy:
        torch.save(model.state_dict(), 'best_model_state_{}.bin'.format(epoch))
        best_accuracy = val_acc