Loss and accuracy decreasing

I’m trying to train an LSTM for classification on Yelp data. I see the training loss decrease, however the accuracy starts decreasing after 3 epochs (peaks around 30%, when a simple baseline like NB gets 53%) and it overfits around the 5th epoch. I assumed something was wrong with the “accuratePredictions” function, but that doesn’t appear to be the case. In that case it seems like the model is the culprit, but it’s hard to tell. So far I’ve checked:

  • The dataset and preprocessing of the strings
  • Custom iterator(I had issues with Dataloader cutting up the arrays of words and mixing them together)
  • Indexing and padding are fine (The vocab class can recreate the original sentence)
  • Embedding looks OK (Hard to tell)
  • Outputs seem fine, but again hard to tell

Below is my code. I would greatly appreciate any ideas or suggestions!

import os
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import numpy as np
from torch.utils.data import DataLoader
import pickle
from utils import processText

def processer(text):
	return processText(text, tokenize_by='regex')
	
class YelpDataset(Dataset):
	def __init__(self, csv_file, transform=None):
		self.data_frame = pd.read_csv(csv_file)
		self.transform = transform

	def __len__(self):
		return len(self.data_frame)

	def __getitem__(self, idx):
		if torch.is_tensor(idx):
			idx = idx.tolist()

		sample = processer(self.data_frame.iloc[idx]['text'])
		value = self.data_frame.iloc[idx]['stars']
		if self.transform:
			sample = self.transform(sample)

		return sample,value

class YelpDataloader():
	def __init__(self, batch_size, dataset):
		self.batch_size = batch_size
		self.num_items = len(dataset)
		self.dataset = dataset
		self.pos = 0

	def __iter__(self):
		return self
	def __next__(self):
		if self.pos < self.num_items:
			self.pos += self.batch_size
			data = [ self.dataset[i][0] for i in range(self.pos - self.batch_size, min(self.num_items, self.pos))]
			label = [ self.dataset[i][1] for i in range(self.pos - self.batch_size, min(self.num_items, self.pos))]
			return data, label
		else:
			raise StopIteration
		
class Vocab:
	def __init__(self):
		self.padding_idx = 0
		self.unk_idx = 1
		self.start_idx = 2
		self.stop_idx = 3
		self.idxLUT = {"<PAD>": self.padding_idx, "<UNK>": self.unk_idx , "<S>": self.start_idx , "</S>": self.stop_idx}
		self.vocabLUT = {val:key for  key,val in self.idxLUT.items()}

	def __len__(self):
		return len(self.idxLUT)

	def __getitem__(self, item):
		if type(item) == str:
			return self.idxLUT.get(item, self.unk_idx)
		else:
			return self.vocabLUT.get(item, "<UNK>")

	def extractVocab(self, texts):
		words = set([word for text in texts for word in text])
		for count, uniqueWord in enumerate(words):
			self.idxLUT[uniqueWord] = count + self.stop_idx + 1
		
		self.vocabLUT = {val: key for  key,val in self.idxLUT.items()}

	# Maybe replace with a JSON eventualy
	def save(self, filename = "vocab"):
		with open(filename + '.pkl', 'wb') as filehandler:
			pickle.dump(self.__dict__, filehandler,2)

	def load(self, filename = "vocab"):
		with open(filename + '.pkl', 'rb') as filehandler:
			tmp_dict = pickle.load(filehandler)
			self.__dict__.update(tmp_dict)

class YelpClassifier(torch.nn.Module):
	def __init__(self, vocab, embed=None,**args):
		super(YelpClassifier, self).__init__()
		self.vocab = vocab
		if embed:
			self.embed = embed
		else:
			self.embed = torch.nn.Embedding(len(self.vocab), args['emb_size'], padding_idx=3)
		# Add LSTM and GRU
		self.rnn = torch.nn.LSTM(args['emb_size'], args['hid_size'],batch_first=True)
		self.output = torch.nn.Linear(args['hid_size'], args['num_classes'])

	def last_timestep(self, unpacked, lengths):
		# Index of the last output for each sequence.
		idx = (lengths - 1).view(-1, 1).expand(unpacked.size(0),
											   unpacked.size(2)).unsqueeze(1)
		return unpacked.gather(1, idx).squeeze()
	
	def _str_to_tensor(self, data):
		lengths = [len(x) for x in data]
		batch_size = len(lengths)
		max_len = max(lengths)
		padded_data = torch.zeros([batch_size,max_len]).long().cuda()
		
		for i in range(batch_size):
			padded_data[i, :lengths[i]] = torch.Tensor([self.vocab[word] for word in data[i]]).long().cuda()
		return padded_data, torch.tensor(lengths).cuda()

	def forward(self, review):
		padded_idxs, lengths = self._str_to_tensor(review)
		embeddings = self.embed(padded_idxs)
		packed = torch.nn.utils.rnn.pack_padded_sequence(embeddings, lengths,batch_first=True, enforce_sorted=False)
		out_packed, _ = self.rnn(packed)
		out_unpacked, _ = torch.nn.utils.rnn.pad_packed_sequence(out_packed, batch_first=True)
		last_hidden_state = self.last_timestep(out_unpacked, lengths)
		logits = self.output(last_hidden_state)
		return logits

def accuratePredictions(prediction, ground_truth):
	return torch.sum(torch.argmax(nn.Softmax(prediction,dim=-1),dim=1) == ground_truth).double().item()

if __name__ == "__main__":
	train_dataset = YelpDataset("yelp_ntrain.csv")
	test_dataset = YelpDataset("yelp_nval.csv")
	vocab = Vocab()
	vocab.load()
	
	cuda0 = torch.device('cuda:0')
	
	bestValAcc = 0
	
	hyperParams = {'emb_size': 150, 'hid_size': 300, 'num_classes': 5}
	classifier = YelpClassifier(vocab, **hyperParams).to(cuda0)
	loss = torch.nn.CrossEntropyLoss(ignore_index=vocab['<PAD>'])
	#for sample in train_dataset:
	optim = torch.optim.Adam(classifier.parameters())
	for epoch in range(10):
		trainingLoss  = 0
		valLoss = 0
		valAccuracy = 0
		trainAccuracy = 0
		for current_val, data in enumerate(YelpDataloader(64, train_dataset)):
			#import pdb;pdb.set_trace()
			classifier.train()
			if current_val% 100 == 0:
				print("Current value", current_val)
			x = data[0]
			y = torch.tensor(data[1]).to(cuda0)
			optim.zero_grad()
			output = classifier(x)
			trainAccuracy += accuratePredictions(output,y)
			epochLoss = loss(output, y - 1)
			trainingLoss += epochLoss.item()
			
			epochLoss.backward()
			optim.step()
		with torch.no_grad():
			numclass = 0
			for current_val, data in enumerate(YelpDataloader(64, test_dataset)):
				classifier.eval()
				if current_val% 100 == 0:
					print("Current valid: ", current_val)
				x = data[0]
				y = torch.tensor(data[1]).to(cuda0)
				output = classifier(x)
				valAccuracy += accuratePredictions(output,y)
				epochLoss = loss(output, y - 1)
				valLoss += epochLoss.item()
		print("Epoch training loss per item: ", trainingLoss/len(train_dataset))
		print("Epoch validation loss loss per item: ", valLoss/len(test_dataset))
		print("Epoch training accuracy: ", trainAccuracy/len(train_dataset))
		print("Epoch val accuracy: ", valAccuracy/len(test_dataset))
		if bestValAcc < valAccuracy/len(test_dataset):
			bestValAcc = valAccuracy/len(test_dataset)
			torch.save(classifier.state_dict(), "model.bin")