Getting zero accuracy with a BERT model

Hi everyone,
I am new to NLP and PyTorch, and I am working on a multi-label text classification problem. I have tried two models: a multi-filter CNN and a simple BERT classifier based on the "bert-base-uncased" pre-trained model. The CNN, which uses word2vec embeddings, trains fine, but the BERT model always reports zero accuracy. I want to use BERT for a better result, but I have had no success so far, so I am probably doing something wrong. The full code for both models and the train/test process is attached below. Please suggest where I am going wrong with the BERT model.

Models

class WordRep(nn.Module):

	def __init__(self, args, Y, dicts):
		super(WordRep, self).__init__()

		self.gpu = args['gpu']

		W = torch.Tensor(self.load_embeddings(args['embed_file']))

		self.embed = nn.Embedding(W.size()[0], W.size()[1], padding_idx=0)
		self.embed.weight.data = W.clone()
		
		self.feature_size = self.embed.embedding_dim

		self.embed_drop = nn.Dropout(p=args['dropout'])

		self.conv_dict = {
					1: [self.feature_size, args['num_filter_maps']],
					2: [self.feature_size, 100, args['num_filter_maps']],
					3: [self.feature_size, 150, 100, args['num_filter_maps']],
					4: [self.feature_size, 200, 150, 100, args['num_filter_maps']]
					 }


	def forward(self, x, target):

		features = [self.embed(x)]

		x = torch.cat(features, dim=2)

		x = self.embed_drop(x)
		return x
	
	def load_embeddings(self, embed_file):
		#also normalizes the embeddings
		W = []
		with open(embed_file) as ef:
			for line in ef:
				line = line.rstrip().split()
				vec = np.array(line[1:]).astype(float)
				vec = vec / float(np.linalg.norm(vec) + 1e-6)
				W.append(vec)
			#UNK embedding, gaussian randomly initialized
			print("adding unk embedding")
			vec = np.random.randn(len(W[-1]))
			vec = vec / float(np.linalg.norm(vec) + 1e-6)
			W.append(vec)
		W = np.array(W)
		return W

class OutputLayer(nn.Module):

	def __init__(self, args, Y, dicts, input_size):
		super(OutputLayer, self).__init__()

		self.U = nn.Linear(input_size, Y)
		xavier_uniform(self.U.weight)


		self.final = nn.Linear(input_size, Y)
		xavier_uniform(self.final.weight)

		self.loss_function = nn.BCEWithLogitsLoss()



	def forward(self, x, target):
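		# per-label attention: alpha attends over the sequence positions for each label;
		# m is the attended document representation and y the per-label logits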

		alpha = F.softmax(self.U.weight.matmul(x.transpose(1, 2)), dim=2)

		m = alpha.matmul(x)

		y = self.final.weight.mul(m).sum(dim=2).add(self.final.bias)

		loss = self.loss_function(y, target)
		return y, loss

class ResidualBlock(nn.Module):

	def __init__(self, inchannel, outchannel, kernel_size, stride, use_res, dropout):
		super(ResidualBlock, self).__init__()
		self.left = nn.Sequential(
			nn.Conv1d(inchannel, outchannel, kernel_size=kernel_size, stride=stride, padding=int(floor(kernel_size / 2)), bias=False),
			nn.BatchNorm1d(outchannel),
			nn.Tanh(),
			nn.Conv1d(outchannel, outchannel, kernel_size=kernel_size, stride=1, padding=int(floor(kernel_size / 2)), bias=False),
			nn.BatchNorm1d(outchannel)
		)

		self.use_res = use_res
		if self.use_res:
			self.shortcut = nn.Sequential(
						nn.Conv1d(inchannel, outchannel, kernel_size=1, stride=stride, bias=False),
						nn.BatchNorm1d(outchannel)
					)

		self.dropout = nn.Dropout(p=dropout)

	def forward(self, x):
		out = self.left(x)
		if self.use_res:
			out += self.shortcut(x)   
		out = torch.tanh(out)
		out = self.dropout(out)
		return out


class MultiResedualCNN(nn.Module):

	def __init__(self, args, Y, dicts):
		super(MultiResedualCNN, self).__init__()

		self.word_rep = WordRep(args, Y, dicts)
		self.embedding_size = self.word_rep.embed.weight.data.size()[0]
		
		self.conv = nn.ModuleList()
		filter_sizes = args['filter_size'].split(',')

		self.filter_num = len(filter_sizes)
		for filter_size in filter_sizes:
			filter_size = int(filter_size)
			one_channel = nn.ModuleList()
			
			
			tmp = nn.Conv1d(self.word_rep.feature_size, self.word_rep.feature_size, kernel_size=filter_size,
							padding=int(floor(filter_size / 2)))
			xavier_uniform(tmp.weight)
			one_channel.add_module('baseconv', tmp)

			conv_dimension = self.word_rep.conv_dict[args['conv_layer']]
			for idx in range(args['conv_layer']):
				tmp = ResidualBlock(conv_dimension[idx], conv_dimension[idx + 1], filter_size, 1, True,
									args['dropout'])
				one_channel.add_module('resconv-{}'.format(idx), tmp)
			
			self.conv.add_module('channel-{}'.format(filter_size), one_channel)

		self.output_layer = OutputLayer(args, Y, dicts, self.filter_num * args['num_filter_maps'])


	def forward(self, x, target):

		x = self.word_rep(x, target)

		x = x.transpose(1, 2)

		conv_result = []
		for conv in self.conv:
			tmp = x
			for idx, md in enumerate(conv):
				if idx == 0:
					tmp = torch.tanh(md(tmp))
				else:    
					tmp = md(tmp)
			tmp = tmp.transpose(1, 2)
			conv_result.append(tmp)
		x = torch.cat(conv_result, dim=2)

		y, loss = self.output_layer(x, target)

		return y, loss

	def freeze_net(self):
		for p in self.word_rep.embed.parameters():
			p.requires_grad = False


class Tr_Bert_V3(nn.Module):

	def __init__(self, args, Y, dicts):
		super(Tr_Bert_V3, self).__init__()

		cache_path = os.path.join(args['bert_dir'], args['pretrained_bert'])
		
		savedModel = None
		if os.path.exists(cache_path):
			savedModel = tr.BertModel.from_pretrained(cache_path, return_dict=True)
		else:
			savedModel = tr.BertModel.from_pretrained(str(args['pretrained_bert']), return_dict=True)
			savedModel.save_pretrained(save_directory = cache_path, save_config=True)
		self.bert = savedModel
		self.config = savedModel.config
		print("Model config {}".format(self.config))
		
		self.dropout = nn.Dropout(self.config.hidden_dropout_prob)
		
		self.classifier = nn.Linear(self.config.hidden_size , Y)
		
		self.apply(self.init_bert_weights)  # note: self.apply() recurses into every submodule, including the loaded BERT encoder
		

	def forward(self, input_ids, token_type_ids, attention_mask, target):
	
		output = self.bert(input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask)
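		# pooler_output is the last-layer [CLS] hidden state passed through BERT's pooler (dense layer + tanh)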
		
		x = self.dropout(output.pooler_output)

		x = x / float(torch.linalg.norm(x) + 1e-6)  # norm is computed over the whole (batch, hidden) tensor, not per example

		y = self.classifier(x)
		loss = self.loss_fn(y, target)
		
		return y, loss

	def loss_fn(self, outputs, target):
	  return nn.BCEWithLogitsLoss()(outputs, target)

	def init_bert_weights(self, module):
		BertLayerNorm = torch.nn.LayerNorm
		
		if isinstance(module, (nn.Linear, nn.Embedding)):
			module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
		elif isinstance(module, BertLayerNorm):
			module.bias.data.zero_()
			module.weight.data.fill_(1.0)
		if isinstance(module, nn.Linear) and module.bias is not None:
			module.bias.data.zero_()

	def freeze_net(self):
		pass

Train-Test

class Train_Test:

	def __init__(self):
		print("Train--Test")
	
	def train(self, args, model, optimizer, epoch, gpu, data_loader):
		print("EPOCH %d" % epoch)

		losses = []


		model.train()

		# loader
		data_iter = iter(data_loader)
		num_iter = len(data_loader)
		
		for i in tqdm(range(num_iter)):
			optimizer.zero_grad()

			if args['model'].find("Bert") != -1:

				inputs_id, segments, masks, labels = next(data_iter)

				inputs_id, segments, masks, labels = torch.LongTensor(inputs_id).cuda(gpu), torch.LongTensor(segments).cuda(gpu), \
													 torch.LongTensor(masks).cuda(gpu), torch.FloatTensor(labels).cuda(gpu)
				
				
				output, loss = model(inputs_id, segments, masks, labels)
				
			else:

				inputs_id, labels = next(data_iter)

				inputs_id, labels = torch.LongTensor(inputs_id).cuda(gpu), torch.FloatTensor(labels).cuda(gpu)

				output, loss = model(inputs_id, labels)

			loss.backward()
			torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
			optimizer.step()

			losses.append(loss.item())

		return losses
	
	def test(self, args, model, data_path, fold, gpu, dicts, data_loader):
		self.model_name = args['model']

		num_labels = len(dicts['ind2c'])

		y, yhat, yhat_raw, hids, losses = [], [], [], [], []

		model.eval()

		# loader
		data_iter = iter(data_loader)
		num_iter = len(data_loader)
		for i in tqdm(range(num_iter)):
			with torch.no_grad():

				if args['model'].find("Bert") != -1:
					inputs_id, segments, masks, labels = next(data_iter)

					inputs_id, segments, masks, labels = torch.LongTensor(inputs_id).cuda(gpu), torch.LongTensor(segments).cuda(gpu),\
														 torch.LongTensor(masks).cuda(gpu), torch.FloatTensor(labels).cuda(gpu)
					
				 
					output, loss = model(inputs_id, segments, masks, labels)
				else:

					inputs_id, labels = next(data_iter)

					inputs_id, labels = torch.LongTensor(inputs_id).cuda(gpu), torch.FloatTensor(labels).cuda(gpu)

					output, loss = model(inputs_id, labels)

				output = torch.sigmoid(output)
				output = output.data.cpu().numpy()

				losses.append(loss.item())
				target_data = labels.data.cpu().numpy()

				yhat_raw.append(output)
				
				output = np.round(output)
				
				y.append(target_data)
				
				yhat.append(output)

		y = np.concatenate(y, axis=0)
		yhat = np.concatenate(yhat, axis=0)
		yhat_raw = np.concatenate(yhat_raw, axis=0)
		
		self.print_metrics(yhat, y, yhat_raw=yhat_raw)
		
	def print_metrics(self, yhat, y, yhat_raw):
		"""
			Inputs:
				yhat: binary predictions matrix
				y: binary ground truth matrix
				yhat_raw: prediction scores matrix (floats)
			Outputs:
				dict holding relevant metrics
		"""
		names = ["acc", "prec", "rec", "f1"]

		#macro
		macro = self.all_macro(yhat, y)
		
		#micro
		ymic = y.ravel()
		yhatmic = yhat.ravel()
		micro = self.all_micro(yhatmic, ymic)

		metrics = {names[i] + "_macro": macro[i] for i in range(len(macro))}
		metrics.update({names[i] + "_micro": micro[i] for i in range(len(micro))})

		roc_auc = self.auc_metrics(yhat_raw, y, ymic)
		metrics.update(roc_auc)
		self.print_result(metrics)

	
	def auc_metrics(self, yhat_raw, y, ymic):
		if yhat_raw.shape[0] <= 1:
			return {}
		fpr = {}
		tpr = {}
		roc_auc = {}
		#get AUC for each label individually
		relevant_labels = []
		auc_labels = {}
		for i in range(y.shape[1]):
			#only if there are true positives for this label
			if y[:,i].sum() > 0:
				fpr[i], tpr[i], _ = roc_curve(y[:,i], yhat_raw[:,i])
				if len(fpr[i]) > 1 and len(tpr[i]) > 1:
					auc_score = auc(fpr[i], tpr[i])
					if not np.isnan(auc_score):
						auc_labels["auc_%d" % i] = auc_score
						relevant_labels.append(i)

		#macro-AUC: just average the auc scores
		aucs = []
		for i in relevant_labels:
			aucs.append(auc_labels['auc_%d' % i])
		roc_auc['auc_macro'] = np.mean(aucs)

		#micro-AUC: just look at each individual prediction
		yhatmic = yhat_raw.ravel()
		fpr["micro"], tpr["micro"], _ = roc_curve(ymic, yhatmic)
		roc_auc["auc_micro"] = auc(fpr["micro"], tpr["micro"])

		return roc_auc
	

	
	def all_micro(self, yhatmic, ymic):
		return self.micro_accuracy(yhatmic, ymic), self.micro_precision(yhatmic, ymic), self.micro_recall(yhatmic, ymic), self.micro_f1(yhatmic, ymic)
	
	def micro_f1(self, yhatmic, ymic):
		prec = self.micro_precision(yhatmic, ymic)
		rec = self.micro_recall(yhatmic, ymic)
		if prec + rec == 0:
			f1 = 0.
		else:
			f1 = 2*(prec*rec)/(prec+rec)
		return f1
	
	def micro_recall(self, yhatmic, ymic):
		return self.intersect_size(yhatmic, ymic, 0) / (ymic.sum(axis=0) + 1e-10) #NaN fix
	
	def micro_precision(self, yhatmic, ymic):
		return self.intersect_size(yhatmic, ymic, 0) / (yhatmic.sum(axis=0) + 1e-10) #NaN fix
	
	def micro_accuracy(self, yhatmic, ymic):
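		# Jaccard-style accuracy over all label decisions: |yhat AND y| / |yhat OR y|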
		return self.intersect_size(yhatmic, ymic, 0) / (self.union_size(yhatmic, ymic, 0) + 1e-10) #NaN fix
	
	def all_macro(self, yhat, y):
		return self.macro_accuracy(yhat, y), self.macro_precision(yhat, y), self.macro_recall(yhat, y), self.macro_f1(yhat, y)
	
	def macro_f1(self, yhat, y):
		prec = self.macro_precision(yhat, y)
		rec = self.macro_recall(yhat, y)
		if prec + rec == 0:
			f1 = 0.
		else:
			f1 = 2*(prec*rec)/(prec+rec)
		return f1
	
	def macro_recall(self, yhat, y):
		num = self.intersect_size(yhat, y, 0) / (y.sum(axis=0) + 1e-10)
		return np.mean(num)
	
	def macro_precision(self, yhat, y):
		num = self.intersect_size(yhat, y, 0) / (yhat.sum(axis=0) + 1e-10)
		return np.mean(num)
	
	def macro_accuracy(self, yhat, y):
		num = self.intersect_size(yhat, y, 0) / (self.union_size(yhat, y, 0) + 1e-10)
		return np.mean(num)
	
	def intersect_size(self, yhat, y, axis):
		#axis=0 for label-level union (macro). axis=1 for instance-level
		return np.logical_and(yhat, y).sum(axis=axis).astype(float)
	
	def union_size(self, yhat, y, axis):
		#axis=0 for label-level union (macro). axis=1 for instance-level
		return np.logical_or(yhat, y).sum(axis=axis).astype(float)
	
	def print_result(self, metrics):
		print()
		
		print("[MACRO] accuracy, precision, recall, f-measure, AUC")
		print("   %.4f, %.4f, %.4f, %.4f, %.4f" % (metrics["acc_macro"], metrics["prec_macro"], metrics["rec_macro"], metrics["f1_macro"], metrics["auc_macro"]))

		print("[MICRO] accuracy, precision, recall, f-measure, AUC")
		print("   %.4f, %.4f, %.4f, %.4f, %.4f" % (metrics["acc_micro"], metrics["prec_micro"], metrics["rec_micro"], metrics["f1_micro"], metrics["auc_micro"]))
	   
		print()

Main

class MyDataset(Dataset):

	def __init__(self, X):
		self.X = X


	def __len__(self):
		return len(self.X)

	def __getitem__(self, idx):
		return self.X[idx]




class Main:

	def __init__(self, args):
		if args['random_seed'] != 0:
			random.seed(args['random_seed'])
			np.random.seed(args['random_seed'])
			torch.manual_seed(args['random_seed'])
			torch.cuda.manual_seed_all(args['random_seed'])
		
		dicts = self.load_lookups(args)
		Y = len(dicts['ind2c'])  # number of labels

		if args['model'] == 'MultiResedualCNN':
			model = MultiResedualCNN(args, Y, dicts)
		if args['model'] == 'Tr_Bert_V3':
			model = Tr_Bert_V3(args, Y, dicts)
		print(model)
		
		if args['model'] == 'MultiResedualCNN':
			optimizer = optim.Adam(model.parameters(), weight_decay=args['weight_decay'], lr=args['lr'])
		else:
			optimizer = optim.AdamW(params=model.parameters(), lr=args['lr'])
			

		
		metrics_hist = defaultdict(lambda: [])
		metrics_hist_te = defaultdict(lambda: [])
		metrics_hist_tr = defaultdict(lambda: [])

		if args['model'].find("Bert") != -1:
			prepare_instance_func = self.prepare_instance_bert
		else:
			prepare_instance_func = self.prepare_instance
			
		train_instances = prepare_instance_func(dicts, args['data_path'], args, args['MAX_LENGTH'])
		
		dev_instances = prepare_instance_func(dicts, args['data_path'].replace('train','dev'), args, args['MAX_LENGTH'])
			
		test_instances = prepare_instance_func(dicts, args['data_path'].replace('train','test'), args, args['MAX_LENGTH'])
		
		if args['model'].find("Bert") != -1:
			collate_func = self.my_collate_bert
		else:
			collate_func = self.my_collate
		
		train_loader = DataLoader(MyDataset(train_instances), args['batch_size'], shuffle=True, collate_fn=collate_func)

		dev_loader = DataLoader(MyDataset(dev_instances), 1, shuffle=False, collate_fn=collate_func)
			
		test_loader = DataLoader(MyDataset(test_instances), 1, shuffle=False, collate_fn=collate_func)
		
		
		train_test = Train_Test()

		for epoch in range(args['n_epochs']):

			epoch_start = time.time()
			losses = train_test.train(args, model, optimizer, epoch, args['gpu'], train_loader)
			loss = np.mean(losses)
			epoch_finish = time.time()
			print("epoch finish in %.2fs, loss: %.4f" % (epoch_finish - epoch_start, loss))

			# test on dev
			evaluation_start = time.time()
			metrics = train_test.test(args, model, args['data_path'], "dev", args['gpu'], dicts, dev_loader)
			evaluation_finish = time.time()
			print("evaluation finish in %.2fs" % (evaluation_finish - evaluation_start))
			if epoch == args['n_epochs'] - 1:
				metrics_te = train_test.test(args, model, args['data_path'], "test", args['gpu'], dicts, test_loader)


	def prepare_instance(self, dicts, filename, args, max_length):
		csv.field_size_limit(sys.maxsize)
		ind2w, w2ind, ind2c, c2ind = dicts['ind2w'], dicts['w2ind'], dicts['ind2c'], dicts['c2ind']
		instances = []
		num_labels = len(dicts['ind2c'])

		with open(filename, 'r') as infile:
			r = csv.reader(infile)
			#header
			next(r)

			for row in tqdm(r):

				text = row[2] 

				labels_idx = np.zeros(num_labels)
				labelled = False

				for l in row[3].split(' '): 
					if l in c2ind.keys():
						code = int(c2ind[l])
						labels_idx[code] = 1
						labelled = True
				if not labelled:
					continue

				tokens_ = text.split()
				tokens = []
				tokens_id = []
				for token in tokens_:
					tokens.append(token)
					token_id = w2ind[token] if token in w2ind else len(w2ind) + 1
					tokens_id.append(token_id)

				if len(tokens) > max_length:
					tokens = tokens[:max_length]
					tokens_id = tokens_id[:max_length]

				dict_instance = {'label': labels_idx,
									 'tokens': tokens,
									 "tokens_id": tokens_id}

				instances.append(dict_instance)

		return instances
	
	def prepare_instance_bert(self, dicts, filename, args, max_length):
		csv.field_size_limit(sys.maxsize)
		ind2w, w2ind, ind2c, c2ind = dicts['ind2w'], dicts['w2ind'], dicts['ind2c'], dicts['c2ind']
		instances = []
		num_labels = len(dicts['ind2c'])
		
		wp_tokenizer = tr.BertTokenizer.from_pretrained(args['pretrained_bert'], do_lower_case=True)


		with open(filename, 'r') as infile:
			r = csv.reader(infile)
			#header
			next(r)

			for row in tqdm(r):
				
				text = row[2] 

				labels_idx = np.zeros(num_labels)
				labelled = False

				for l in row[3].split(' '):
					if l in c2ind.keys():
						code = int(c2ind[l])
						labels_idx[code] = 1
						labelled = True
				if not labelled:
					continue

				tokens = wp_tokenizer.tokenize(text)
				tokens = tokens[:max_length-2]
				tokens.insert(0, '[CLS]')
				tokens.append('[SEP]')

				tokens_id = wp_tokenizer.convert_tokens_to_ids(tokens)
				masks = [1] * len(tokens)
				segments = [0] * len(tokens)

	
				dict_instance = {'label':labels_idx, 'tokens':tokens,
								 "tokens_id":tokens_id, "segments":segments, "masks":masks}

				instances.append(dict_instance)

		return instances
	
	def my_collate(self, x):
		words = [x_['tokens_id'] for x_ in x]
		
		seq_len = [len(w) for w in words]
		max_seq_len = max(seq_len)

		inputs_id = self.pad_sequence(words, max_seq_len)

		labels = [x_['label'] for x_ in x]
		

		return inputs_id, labels
   

	def my_collate_bert(self, x):
		words = [x_['tokens_id'] for x_ in x]
		segments = [x_['segments'] for x_ in x]
		masks = [x_['masks'] for x_ in x]
		
		seq_len = [len(w) for w in words]
		max_seq_len = max(seq_len)
		
		inputs_id = self.pad_sequence(words, max_seq_len)
		segments = self.pad_sequence(segments, max_seq_len)
		masks = self.pad_sequence(masks, max_seq_len)


		labels = [x_['label'] for x_ in x]

		return inputs_id, segments, masks, labels
	
	
	def pad_sequence(self, x, max_len, dtype=int):

		padded_x = np.zeros((len(x), max_len), dtype=dtype)
		for i, row in enumerate(x):
			padded_x[i][:len(row)] = row

		return padded_x

For the BERT model I am using the values below:

args['model'] = 'Tr_Bert_V3'
args['pretrained_bert'] = 'bert-base-uncased'
args['n_epochs'] = 3
args['MAX_LENGTH'] = 512
args['lr'] = 5e-5

Result:

EPOCH 0
100%|██████████| 3750/3750 [15:54<00:00, 3.93it/s]
epoch finish in 954.40s, loss: 0.3254
100%|██████████| 7908/7908 [02:57<00:00, 44.55it/s]

[MACRO] accuracy, precision, recall, f-measure, AUC
0.0000, 0.0000, 0.0000, 0.0000, 0.5005
[MICRO] accuracy, precision, recall, f-measure, AUC
0.0000, 0.0000, 0.0000, 0.0000, 0.8582

It might be good to sanity check a single output of the model to see what is going on here, or even what the print_metrics function is doing, since 0% accuracy is about as likely as 100% accuracy.
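
For example, something along these lines (reusing the names from your code; I'm assuming model, dev_loader, and gpu are already set up as in your Main):

# pull one BERT-style batch and look at the raw scores before any rounding
model.eval()
with torch.no_grad():
    inputs_id, segments, masks, labels = next(iter(dev_loader))
    inputs_id = torch.LongTensor(inputs_id).cuda(gpu)
    segments = torch.LongTensor(segments).cuda(gpu)
    masks = torch.LongTensor(masks).cuda(gpu)
    labels = torch.FloatTensor(labels).cuda(gpu)

    logits, loss = model(inputs_id, segments, masks, labels)
    probs = torch.sigmoid(logits)

    print("loss                :", loss.item())
    print("logits min/max/mean :", logits.min().item(), logits.max().item(), logits.mean().item())
    print("probs  min/max/mean :", probs.min().item(), probs.max().item(), probs.mean().item())
    print("predicted positives :", (probs > 0.5).sum().item())
    print("positives in labels :", int(labels.sum().item()))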

Hi @eqy,
Thank you for your reply.
I have updated the post above to include the print_metrics function. As you suggested, I will check the output values of the model. My guess is that the output is always 0 after rounding the raw scores, which is why precision, recall, and F-score are 0 along with accuracy.
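
To confirm, I will add a quick check inside test(), right after y, yhat, and yhat_raw are concatenated, along these lines:

# sanity check on the raw dev-set scores before and after np.round()
print("raw score min/max/mean   :", yhat_raw.min(), yhat_raw.max(), yhat_raw.mean())
print("raw scores >= 0.5        :", int((yhat_raw >= 0.5).sum()))
print("positives in ground truth:", int(y.sum()))
print("all-zero prediction rows :", int((yhat.sum(axis=1) == 0).sum()), "out of", yhat.shape[0])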