Hi everyone,
I am new to NLP and PyTorch, and I am trying to solve a multi-class text classification problem. I have tried two models: a multi-filter CNN network and a simple BERT classifier. The BERT model uses the "bert-base-uncased" pre-trained weights, while the CNN model uses word2vec embeddings. The CNN model trains fine, but the BERT model always gives zero accuracy. I want to use BERT for better results, but I have had no success so far, so I am probably doing something wrong. The full code for both models and for the training and testing process is attached below. Please suggest where I am going wrong with the BERT model.
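To make the setup concrete: both models train against multi-hot label vectors with BCEWithLogitsLoss, and the test metrics threshold the sigmoid outputs at 0.5. A minimal sketch with made-up numbers (not my real data):

import torch
import torch.nn as nn

Y = 5                                          # number of labels
logits = torch.randn(2, Y)                     # raw model outputs for a batch of 2
target = torch.tensor([[1., 0., 1., 0., 0.],
                       [0., 0., 0., 1., 0.]])  # multi-hot ground truth
loss = nn.BCEWithLogitsLoss()(logits, target)  # the loss both models use
preds = (torch.sigmoid(logits) > 0.5).float()  # same thresholding as np.round in test()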
Models
class WordRep(nn.Module):
    def __init__(self, args, Y, dicts):
        super(WordRep, self).__init__()
        self.gpu = args['gpu']
        W = torch.Tensor(self.load_embeddings(args['embed_file']))
        self.embed = nn.Embedding(W.size()[0], W.size()[1], padding_idx=0)
        self.embed.weight.data = W.clone()
        self.feature_size = self.embed.embedding_dim
        self.embed_drop = nn.Dropout(p=args['dropout'])
        self.conv_dict = {
            1: [self.feature_size, args['num_filter_maps']],
            2: [self.feature_size, 100, args['num_filter_maps']],
            3: [self.feature_size, 150, 100, args['num_filter_maps']],
            4: [self.feature_size, 200, 150, 100, args['num_filter_maps']]
        }

    def forward(self, x, target):
        features = [self.embed(x)]
        x = torch.cat(features, dim=2)
        x = self.embed_drop(x)
        return x

    def load_embeddings(self, embed_file):
        # also normalizes the embeddings
        W = []
        with open(embed_file) as ef:
            for line in ef:
                line = line.rstrip().split()
                vec = np.array(line[1:]).astype(np.float64)
                vec = vec / float(np.linalg.norm(vec) + 1e-6)
                W.append(vec)
        # UNK embedding, gaussian randomly initialized
        print("adding unk embedding")
        vec = np.random.randn(len(W[-1]))
        vec = vec / float(np.linalg.norm(vec) + 1e-6)
        W.append(vec)
        W = np.array(W)
        return W
class OutputLayer(nn.Module):
    def __init__(self, args, Y, dicts, input_size):
        super(OutputLayer, self).__init__()
        self.U = nn.Linear(input_size, Y)
        xavier_uniform(self.U.weight)
        self.final = nn.Linear(input_size, Y)
        xavier_uniform(self.final.weight)
        self.loss_function = nn.BCEWithLogitsLoss()

    def forward(self, x, target):
        alpha = F.softmax(self.U.weight.matmul(x.transpose(1, 2)), dim=2)
        m = alpha.matmul(x)
        y = self.final.weight.mul(m).sum(dim=2).add(self.final.bias)
        loss = self.loss_function(y, target)
        return y, loss
class ResidualBlock(nn.Module):
    def __init__(self, inchannel, outchannel, kernel_size, stride, use_res, dropout):
        super(ResidualBlock, self).__init__()
        self.left = nn.Sequential(
            nn.Conv1d(inchannel, outchannel, kernel_size=kernel_size, stride=stride, padding=int(floor(kernel_size / 2)), bias=False),
            nn.BatchNorm1d(outchannel),
            nn.Tanh(),
            nn.Conv1d(outchannel, outchannel, kernel_size=kernel_size, stride=1, padding=int(floor(kernel_size / 2)), bias=False),
            nn.BatchNorm1d(outchannel)
        )
        self.use_res = use_res
        if self.use_res:
            self.shortcut = nn.Sequential(
                nn.Conv1d(inchannel, outchannel, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm1d(outchannel)
            )
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x):
        out = self.left(x)
        if self.use_res:
            out += self.shortcut(x)
        out = torch.tanh(out)
        out = self.dropout(out)
        return out
class MultiResedualCNN(nn.Module):
    def __init__(self, args, Y, dicts):
        super(MultiResedualCNN, self).__init__()
        self.word_rep = WordRep(args, Y, dicts)
        self.embedding_size = self.word_rep.embed.weight.data.size()[0]
        self.conv = nn.ModuleList()
        filter_sizes = args['filter_size'].split(',')
        self.filter_num = len(filter_sizes)
        for filter_size in filter_sizes:
            filter_size = int(filter_size)
            one_channel = nn.ModuleList()
            tmp = nn.Conv1d(self.word_rep.feature_size, self.word_rep.feature_size, kernel_size=filter_size,
                            padding=int(floor(filter_size / 2)))
            xavier_uniform(tmp.weight)
            one_channel.add_module('baseconv', tmp)
            conv_dimension = self.word_rep.conv_dict[args['conv_layer']]
            for idx in range(args['conv_layer']):
                tmp = ResidualBlock(conv_dimension[idx], conv_dimension[idx + 1], filter_size, 1, True,
                                    args['dropout'])
                one_channel.add_module('resconv-{}'.format(idx), tmp)
            self.conv.add_module('channel-{}'.format(filter_size), one_channel)
        self.output_layer = OutputLayer(args, Y, dicts, self.filter_num * args['num_filter_maps'])

    def forward(self, x, target):
        x = self.word_rep(x, target)
        x = x.transpose(1, 2)
        conv_result = []
        for conv in self.conv:
            tmp = x
            for idx, md in enumerate(conv):
                if idx == 0:
                    tmp = torch.tanh(md(tmp))
                else:
                    tmp = md(tmp)
            tmp = tmp.transpose(1, 2)
            conv_result.append(tmp)
        x = torch.cat(conv_result, dim=2)
        y, loss = self.output_layer(x, target)
        return y, loss

    def freeze_net(self):
        for p in self.word_rep.embed.parameters():
            p.requires_grad = False
class Tr_Bert_V3(nn.Module):
    def __init__(self, args, Y, dicts):
        super(Tr_Bert_V3, self).__init__()
        cache_path = os.path.join(args['bert_dir'], args['pretrained_bert'])
        savedModel = None
        if os.path.exists(cache_path):
            savedModel = tr.BertModel.from_pretrained(cache_path, return_dict=True)
        else:
            savedModel = tr.BertModel.from_pretrained(str(args['pretrained_bert']), return_dict=True)
            savedModel.save_pretrained(save_directory=cache_path, save_config=True)
        self.bert = savedModel
        self.config = savedModel.config
        print("Model config {}".format(self.config))
        self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
        self.classifier = nn.Linear(self.config.hidden_size, Y)
        self.apply(self.init_bert_weights)

    def forward(self, input_ids, token_type_ids, attention_mask, target):
        output = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
        x = self.dropout(output.pooler_output)
        x = x / float(torch.linalg.norm(x) + 1e-6)
        y = self.classifier(x)
        loss = self.loss_fn(y, target)
        return y, loss

    def loss_fn(self, outputs, target):
        return nn.BCEWithLogitsLoss()(outputs, target)

    def init_bert_weights(self, module):
        BertLayerNorm = torch.nn.LayerNorm
        if isinstance(module, (nn.Linear, nn.Embedding)):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
        elif isinstance(module, BertLayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        if isinstance(module, nn.Linear) and module.bias is not None:
            module.bias.data.zero_()

    def freeze_net(self):
        pass
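For reference, this is roughly how one padded batch goes into Tr_Bert_V3's forward; the tensors below are dummies just to show the shapes, assuming model is a Tr_Bert_V3 instance and Y is the number of labels:

import torch
# dummy batch: 2 sequences padded to length 512 (shapes only, not real data)
input_ids      = torch.zeros(2, 512, dtype=torch.long)
token_type_ids = torch.zeros(2, 512, dtype=torch.long)
attention_mask = torch.ones(2, 512, dtype=torch.long)
target         = torch.zeros(2, Y)   # multi-hot float labels
logits, loss = model(input_ids, token_type_ids, attention_mask, target)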
Train-Test
class Train_Test:
    def __init__(self):
        print("Train--Test")

    def train(self, args, model, optimizer, epoch, gpu, data_loader):
        print("EPOCH %d" % epoch)
        losses = []
        model.train()
        # loader
        data_iter = iter(data_loader)
        num_iter = len(data_loader)
        for i in tqdm(range(num_iter)):
            optimizer.zero_grad()
            if args['model'].find("Bert") != -1:
                inputs_id, segments, masks, labels = next(data_iter)
                inputs_id, segments, masks, labels = torch.LongTensor(inputs_id).cuda(gpu), torch.LongTensor(segments).cuda(gpu), \
                                                     torch.LongTensor(masks).cuda(gpu), torch.FloatTensor(labels).cuda(gpu)
                output, loss = model(inputs_id, segments, masks, labels)
            else:
                inputs_id, labels = next(data_iter)
                inputs_id, labels = torch.LongTensor(inputs_id).cuda(gpu), torch.FloatTensor(labels).cuda(gpu)
                output, loss = model(inputs_id, labels)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            losses.append(loss.item())
        return losses

    def test(self, args, model, data_path, fold, gpu, dicts, data_loader):
        self.model_name = args['model']
        num_labels = len(dicts['ind2c'])
        y, yhat, yhat_raw, hids, losses = [], [], [], [], []
        model.eval()
        # loader
        data_iter = iter(data_loader)
        num_iter = len(data_loader)
        for i in tqdm(range(num_iter)):
            with torch.no_grad():
                if args['model'].find("Bert") != -1:
                    inputs_id, segments, masks, labels = next(data_iter)
                    inputs_id, segments, masks, labels = torch.LongTensor(inputs_id).cuda(gpu), torch.LongTensor(segments).cuda(gpu), \
                                                         torch.LongTensor(masks).cuda(gpu), torch.FloatTensor(labels).cuda(gpu)
                    output, loss = model(inputs_id, segments, masks, labels)
                else:
                    inputs_id, labels = next(data_iter)
                    inputs_id, labels = torch.LongTensor(inputs_id).cuda(gpu), torch.FloatTensor(labels).cuda(gpu)
                    output, loss = model(inputs_id, labels)
                output = torch.sigmoid(output)
                output = output.data.cpu().numpy()
                losses.append(loss.item())
                target_data = labels.data.cpu().numpy()
                yhat_raw.append(output)
                output = np.round(output)
                y.append(target_data)
                yhat.append(output)
        y = np.concatenate(y, axis=0)
        yhat = np.concatenate(yhat, axis=0)
        yhat_raw = np.concatenate(yhat_raw, axis=0)
        self.print_metrics(yhat, y, yhat_raw=yhat_raw)
    def print_metrics(self, yhat, y, yhat_raw):
        """
        Inputs:
            yhat: binary predictions matrix
            y: binary ground truth matrix
            yhat_raw: prediction scores matrix (floats)
        Outputs:
            prints the macro/micro metrics and AUC (nothing is returned)
        """
        names = ["acc", "prec", "rec", "f1"]
        # macro
        macro = self.all_macro(yhat, y)
        # micro
        ymic = y.ravel()
        yhatmic = yhat.ravel()
        micro = self.all_micro(yhatmic, ymic)
        metrics = {names[i] + "_macro": macro[i] for i in range(len(macro))}
        metrics.update({names[i] + "_micro": micro[i] for i in range(len(micro))})
        roc_auc = self.auc_metrics(yhat_raw, y, ymic)
        metrics.update(roc_auc)
        self.print_result(metrics)
    def auc_metrics(self, yhat_raw, y, ymic):
        if yhat_raw.shape[0] <= 1:
            return
        fpr = {}
        tpr = {}
        roc_auc = {}
        # get AUC for each label individually
        relevant_labels = []
        auc_labels = {}
        for i in range(y.shape[1]):
            # only if there are true positives for this label
            if y[:, i].sum() > 0:
                fpr[i], tpr[i], _ = roc_curve(y[:, i], yhat_raw[:, i])
                if len(fpr[i]) > 1 and len(tpr[i]) > 1:
                    auc_score = auc(fpr[i], tpr[i])
                    if not np.isnan(auc_score):
                        auc_labels["auc_%d" % i] = auc_score
                        relevant_labels.append(i)
        # macro-AUC: just average the auc scores
        aucs = []
        for i in relevant_labels:
            aucs.append(auc_labels['auc_%d' % i])
        roc_auc['auc_macro'] = np.mean(aucs)
        # micro-AUC: just look at each individual prediction
        yhatmic = yhat_raw.ravel()
        fpr["micro"], tpr["micro"], _ = roc_curve(ymic, yhatmic)
        roc_auc["auc_micro"] = auc(fpr["micro"], tpr["micro"])
        return roc_auc

    def all_micro(self, yhatmic, ymic):
        return self.micro_accuracy(yhatmic, ymic), self.micro_precision(yhatmic, ymic), self.micro_recall(yhatmic, ymic), self.micro_f1(yhatmic, ymic)

    def micro_f1(self, yhatmic, ymic):
        prec = self.micro_precision(yhatmic, ymic)
        rec = self.micro_recall(yhatmic, ymic)
        if prec + rec == 0:
            f1 = 0.
        else:
            f1 = 2 * (prec * rec) / (prec + rec)
        return f1

    def micro_recall(self, yhatmic, ymic):
        return self.intersect_size(yhatmic, ymic, 0) / (ymic.sum(axis=0) + 1e-10)  # NaN fix

    def micro_precision(self, yhatmic, ymic):
        return self.intersect_size(yhatmic, ymic, 0) / (yhatmic.sum(axis=0) + 1e-10)  # NaN fix

    def micro_accuracy(self, yhatmic, ymic):
        return self.intersect_size(yhatmic, ymic, 0) / (self.union_size(yhatmic, ymic, 0) + 1e-10)  # NaN fix

    def all_macro(self, yhat, y):
        return self.macro_accuracy(yhat, y), self.macro_precision(yhat, y), self.macro_recall(yhat, y), self.macro_f1(yhat, y)

    def macro_f1(self, yhat, y):
        prec = self.macro_precision(yhat, y)
        rec = self.macro_recall(yhat, y)
        if prec + rec == 0:
            f1 = 0.
        else:
            f1 = 2 * (prec * rec) / (prec + rec)
        return f1

    def macro_recall(self, yhat, y):
        num = self.intersect_size(yhat, y, 0) / (y.sum(axis=0) + 1e-10)
        return np.mean(num)

    def macro_precision(self, yhat, y):
        num = self.intersect_size(yhat, y, 0) / (yhat.sum(axis=0) + 1e-10)
        return np.mean(num)

    def macro_accuracy(self, yhat, y):
        num = self.intersect_size(yhat, y, 0) / (self.union_size(yhat, y, 0) + 1e-10)
        return np.mean(num)

    def intersect_size(self, yhat, y, axis):
        # axis=0 for label-level intersection (macro). axis=1 for instance-level
        return np.logical_and(yhat, y).sum(axis=axis).astype(float)

    def union_size(self, yhat, y, axis):
        # axis=0 for label-level union (macro). axis=1 for instance-level
        return np.logical_or(yhat, y).sum(axis=axis).astype(float)

    def print_result(self, metrics):
        print()
        print("[MACRO] accuracy, precision, recall, f-measure, AUC")
        print("%.4f, %.4f, %.4f, %.4f, %.4f" % (metrics["acc_macro"], metrics["prec_macro"], metrics["rec_macro"], metrics["f1_macro"], metrics["auc_macro"]))
        print("[MICRO] accuracy, precision, recall, f-measure, AUC")
        print("%.4f, %.4f, %.4f, %.4f, %.4f" % (metrics["acc_micro"], metrics["prec_micro"], metrics["rec_micro"], metrics["f1_micro"], metrics["auc_micro"]))
        print()
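To make the micro metrics concrete, here is a tiny made-up example of the counts they reduce to (not from my data):

import numpy as np
y    = np.array([[1, 0, 1],
                 [0, 1, 0]])   # ground truth: 2 samples, 3 labels
yhat = np.array([[1, 0, 0],
                 [0, 1, 0]])   # thresholded predictions
ymic, yhatmic = y.ravel(), yhat.ravel()
inter = np.logical_and(yhatmic, ymic).sum()   # 2 correctly predicted positives
union = np.logical_or(yhatmic, ymic).sum()    # 3 positions where either is 1
print(inter / union)                          # micro accuracy  = 0.6667
print(inter / ymic.sum())                     # micro recall    = 0.6667
print(inter / yhatmic.sum())                  # micro precision = 1.0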
Main
class MyDataset(Dataset):
    def __init__(self, X):
        self.X = X

    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):
        return self.X[idx]
class Main:
    def __init__(self, args):
        if args['random_seed'] != 0:
            random.seed(args['random_seed'])
            np.random.seed(args['random_seed'])
            torch.manual_seed(args['random_seed'])
            torch.cuda.manual_seed_all(args['random_seed'])
        dicts = self.load_lookups(args)
        if args['model'] == 'MultiResedualCNN':
            model = MultiResedualCNN(args, Y, dicts)
        if args['model'] == 'Tr_Bert_V3':
            model = Tr_Bert_V3(args, Y, dicts)
        print(model)
        if args['model'] == 'MultiResedualCNN':
            optimizer = optim.Adam(model.parameters(), weight_decay=args['weight_decay'], lr=args['lr'])
        else:
            optimizer = optim.AdamW(params=model.parameters(), lr=args['lr'])
        metrics_hist = defaultdict(lambda: [])
        metrics_hist_te = defaultdict(lambda: [])
        metrics_hist_tr = defaultdict(lambda: [])
        if args['model'].find("Bert") != -1:
            prepare_instance_func = self.prepare_instance_bert
        else:
            prepare_instance_func = self.prepare_instance
        train_instances = prepare_instance_func(dicts, args['data_path'], args, args['MAX_LENGTH'])
        dev_instances = prepare_instance_func(dicts, args['data_path'].replace('train','dev'), args, args['MAX_LENGTH'])
        test_instances = prepare_instance_func(dicts, args['data_path'].replace('train','test'), args, args['MAX_LENGTH'])
        if args['model'].find("Bert") != -1:
            collate_func = self.my_collate_bert
        else:
            collate_func = self.my_collate
        train_loader = DataLoader(MyDataset(train_instances), args['batch_size'], shuffle=True, collate_fn=collate_func)
        dev_loader = DataLoader(MyDataset(dev_instances), 1, shuffle=False, collate_fn=collate_func)
        test_loader = DataLoader(MyDataset(test_instances), 1, shuffle=False, collate_fn=collate_func)
        train_test = Train_Test()
        for epoch in range(args['n_epochs']):
            epoch_start = time.time()
            losses = train_test.train(args, model, optimizer, epoch, args['gpu'], train_loader)
            loss = np.mean(losses)
            epoch_finish = time.time()
            print("epoch finish in %.2fs, loss: %.4f" % (epoch_finish - epoch_start, loss))
            # test on dev
            evaluation_start = time.time()
            metrics = train_test.test(args, model, args['data_path'], "dev", args['gpu'], dicts, dev_loader)
            evaluation_finish = time.time()
            print("evaluation finish in %.2fs" % (evaluation_finish - evaluation_start))
            if epoch == args['n_epochs'] - 1:
                metrics_te = train_test.test(args, model, args['data_path'], "test", args['gpu'], dicts, test_loader)
    def prepare_instance(self, dicts, filename, args, max_length):
        csv.field_size_limit(sys.maxsize)
        ind2w, w2ind, ind2c, c2ind = dicts['ind2w'], dicts['w2ind'], dicts['ind2c'], dicts['c2ind']
        instances = []
        num_labels = len(dicts['ind2c'])
        with open(filename, 'r') as infile:
            r = csv.reader(infile)
            # header
            next(r)
            for row in tqdm(r):
                text = row[2]
                labels_idx = np.zeros(num_labels)
                labelled = False
                for l in row[3].split(' '):
                    if l in c2ind.keys():
                        code = int(c2ind[l])
                        labels_idx[code] = 1
                        labelled = True
                if not labelled:
                    continue
                tokens_ = text.split()
                tokens = []
                tokens_id = []
                for token in tokens_:
                    tokens.append(token)
                    token_id = w2ind[token] if token in w2ind else len(w2ind) + 1
                    tokens_id.append(token_id)
                if len(tokens) > max_length:
                    tokens = tokens[:max_length]
                    tokens_id = tokens_id[:max_length]
                dict_instance = {'label': labels_idx,
                                 'tokens': tokens,
                                 "tokens_id": tokens_id}
                instances.append(dict_instance)
        return instances
    def prepare_instance_bert(self, dicts, filename, args, max_length):
        csv.field_size_limit(sys.maxsize)
        ind2w, w2ind, ind2c, c2ind = dicts['ind2w'], dicts['w2ind'], dicts['ind2c'], dicts['c2ind']
        instances = []
        num_labels = len(dicts['ind2c'])
        wp_tokenizer = tr.BertTokenizer.from_pretrained(args['pretrained_bert'], do_lower_case=True)
        with open(filename, 'r') as infile:
            r = csv.reader(infile)
            # header
            next(r)
            for row in tqdm(r):
                text = row[2]
                labels_idx = np.zeros(num_labels)
                labelled = False
                for l in row[3].split(' '):
                    if l in c2ind.keys():
                        code = int(c2ind[l])
                        labels_idx[code] = 1
                        labelled = True
                if not labelled:
                    continue
                tokens = wp_tokenizer.tokenize(text)
                tokens = tokens[:max_length - 2]
                tokens.insert(0, '[CLS]')
                tokens.append('[SEP]')
                tokens_id = wp_tokenizer.convert_tokens_to_ids(tokens)
                masks = [1] * len(tokens)
                segments = [0] * len(tokens)
                dict_instance = {'label': labels_idx, 'tokens': tokens,
                                 "tokens_id": tokens_id, "segments": segments, "masks": masks}
                instances.append(dict_instance)
        return instances
    def my_collate(self, x):
        words = [x_['tokens_id'] for x_ in x]
        seq_len = [len(w) for w in words]
        max_seq_len = max(seq_len)
        inputs_id = self.pad_sequence(words, max_seq_len)
        labels = [x_['label'] for x_ in x]
        return inputs_id, labels

    def my_collate_bert(self, x):
        words = [x_['tokens_id'] for x_ in x]
        segments = [x_['segments'] for x_ in x]
        masks = [x_['masks'] for x_ in x]
        seq_len = [len(w) for w in words]
        max_seq_len = max(seq_len)
        inputs_id = self.pad_sequence(words, max_seq_len)
        segments = self.pad_sequence(segments, max_seq_len)
        masks = self.pad_sequence(masks, max_seq_len)
        labels = [x_['label'] for x_ in x]
        return inputs_id, segments, masks, labels

    def pad_sequence(self, x, max_len, dtype=np.int64):
        # np.int is removed in recent NumPy, so a concrete dtype is used here
        padded_x = np.zeros((len(x), max_len), dtype=dtype)
        for i, row in enumerate(x):
            padded_x[i][:len(row)] = row
        return padded_x
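As an illustration, this is roughly what one prepared BERT instance looks like for a toy sentence (not my actual data; tr is the transformers alias used in the code above):

wp_tokenizer = tr.BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
tokens = ['[CLS]'] + wp_tokenizer.tokenize("the patient was admitted with chest pain") + ['[SEP]']
tokens_id = wp_tokenizer.convert_tokens_to_ids(tokens)   # integer ids, e.g. starting with 101 ([CLS]) and ending with 102 ([SEP])
masks = [1] * len(tokens)
segments = [0] * len(tokens)
# my_collate_bert then zero-pads tokens_id, segments and masks up to the longest sequence in the batch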
For the Bert model, I am using the following argument values:
args['model'] = 'Tr_Bert_V3'
args['pretrained_bert'] = 'bert-base-uncased'
args['n_epochs'] = 3
args['MAX_LENGTH'] = 512
args['lr'] = 5e-5
Result:
EPOCH 0
100%|██████████| 3750/3750 [15:54<00:00, 3.93it/s]
epoch finish in 954.40s, loss: 0.3254
100%|██████████| 7908/7908 [02:57<00:00, 44.55it/s]
[MACRO] accuracy, precision, recall, f-measure, AUC
0.0000, 0.0000, 0.0000, 0.0000, 0.5005
[MICRO] accuracy, precision, recall, f-measure, AUC
0.0000, 0.0000, 0.0000, 0.0000, 0.8582