I’m trying to train an LSTM for classification on Yelp data. I see the training loss decrease, however the accuracy starts decreasing after 3 epochs (peaks around 30%, when a simple baseline like NB gets 53%) and it overfits around the 5th epoch. I assumed something was wrong with the “accuratePredictions” function, but that doesn’t appear to be the case. In that case it seems like the model is the culprit, but it’s hard to tell. So far I’ve checked:
- The dataset and preprocessing of the strings
- Custom iterator(I had issues with Dataloader cutting up the arrays of words and mixing them together)
- Indexing and padding are fine (The vocab class can recreate the original sentence)
- Embedding looks OK (Hard to tell)
- Outputs seem fine, but again hard to tell
Below is my code. I would greatly appreciate any ideas or suggestions!
import os
import torch
import pandas as pd
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
import numpy as np
from torch.utils.data import DataLoader
import pickle
from utils import processText
def processer(text):
return processText(text, tokenize_by='regex')
class YelpDataset(Dataset):
def __init__(self, csv_file, transform=None):
self.data_frame = pd.read_csv(csv_file)
self.transform = transform
def __len__(self):
return len(self.data_frame)
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
sample = processer(self.data_frame.iloc[idx]['text'])
value = self.data_frame.iloc[idx]['stars']
if self.transform:
sample = self.transform(sample)
return sample,value
class YelpDataloader():
def __init__(self, batch_size, dataset):
self.batch_size = batch_size
self.num_items = len(dataset)
self.dataset = dataset
self.pos = 0
def __iter__(self):
return self
def __next__(self):
if self.pos < self.num_items:
self.pos += self.batch_size
data = [ self.dataset[i][0] for i in range(self.pos - self.batch_size, min(self.num_items, self.pos))]
label = [ self.dataset[i][1] for i in range(self.pos - self.batch_size, min(self.num_items, self.pos))]
return data, label
else:
raise StopIteration
class Vocab:
def __init__(self):
self.padding_idx = 0
self.unk_idx = 1
self.start_idx = 2
self.stop_idx = 3
self.idxLUT = {"<PAD>": self.padding_idx, "<UNK>": self.unk_idx , "<S>": self.start_idx , "</S>": self.stop_idx}
self.vocabLUT = {val:key for key,val in self.idxLUT.items()}
def __len__(self):
return len(self.idxLUT)
def __getitem__(self, item):
if type(item) == str:
return self.idxLUT.get(item, self.unk_idx)
else:
return self.vocabLUT.get(item, "<UNK>")
def extractVocab(self, texts):
words = set([word for text in texts for word in text])
for count, uniqueWord in enumerate(words):
self.idxLUT[uniqueWord] = count + self.stop_idx + 1
self.vocabLUT = {val: key for key,val in self.idxLUT.items()}
# Maybe replace with a JSON eventualy
def save(self, filename = "vocab"):
with open(filename + '.pkl', 'wb') as filehandler:
pickle.dump(self.__dict__, filehandler,2)
def load(self, filename = "vocab"):
with open(filename + '.pkl', 'rb') as filehandler:
tmp_dict = pickle.load(filehandler)
self.__dict__.update(tmp_dict)
class YelpClassifier(torch.nn.Module):
def __init__(self, vocab, embed=None,**args):
super(YelpClassifier, self).__init__()
self.vocab = vocab
if embed:
self.embed = embed
else:
self.embed = torch.nn.Embedding(len(self.vocab), args['emb_size'], padding_idx=3)
# Add LSTM and GRU
self.rnn = torch.nn.LSTM(args['emb_size'], args['hid_size'],batch_first=True)
self.output = torch.nn.Linear(args['hid_size'], args['num_classes'])
def last_timestep(self, unpacked, lengths):
# Index of the last output for each sequence.
idx = (lengths - 1).view(-1, 1).expand(unpacked.size(0),
unpacked.size(2)).unsqueeze(1)
return unpacked.gather(1, idx).squeeze()
def _str_to_tensor(self, data):
lengths = [len(x) for x in data]
batch_size = len(lengths)
max_len = max(lengths)
padded_data = torch.zeros([batch_size,max_len]).long().cuda()
for i in range(batch_size):
padded_data[i, :lengths[i]] = torch.Tensor([self.vocab[word] for word in data[i]]).long().cuda()
return padded_data, torch.tensor(lengths).cuda()
def forward(self, review):
padded_idxs, lengths = self._str_to_tensor(review)
embeddings = self.embed(padded_idxs)
packed = torch.nn.utils.rnn.pack_padded_sequence(embeddings, lengths,batch_first=True, enforce_sorted=False)
out_packed, _ = self.rnn(packed)
out_unpacked, _ = torch.nn.utils.rnn.pad_packed_sequence(out_packed, batch_first=True)
last_hidden_state = self.last_timestep(out_unpacked, lengths)
logits = self.output(last_hidden_state)
return logits
def accuratePredictions(prediction, ground_truth):
return torch.sum(torch.argmax(nn.Softmax(prediction,dim=-1),dim=1) == ground_truth).double().item()
if __name__ == "__main__":
train_dataset = YelpDataset("yelp_ntrain.csv")
test_dataset = YelpDataset("yelp_nval.csv")
vocab = Vocab()
vocab.load()
cuda0 = torch.device('cuda:0')
bestValAcc = 0
hyperParams = {'emb_size': 150, 'hid_size': 300, 'num_classes': 5}
classifier = YelpClassifier(vocab, **hyperParams).to(cuda0)
loss = torch.nn.CrossEntropyLoss(ignore_index=vocab['<PAD>'])
#for sample in train_dataset:
optim = torch.optim.Adam(classifier.parameters())
for epoch in range(10):
trainingLoss = 0
valLoss = 0
valAccuracy = 0
trainAccuracy = 0
for current_val, data in enumerate(YelpDataloader(64, train_dataset)):
#import pdb;pdb.set_trace()
classifier.train()
if current_val% 100 == 0:
print("Current value", current_val)
x = data[0]
y = torch.tensor(data[1]).to(cuda0)
optim.zero_grad()
output = classifier(x)
trainAccuracy += accuratePredictions(output,y)
epochLoss = loss(output, y - 1)
trainingLoss += epochLoss.item()
epochLoss.backward()
optim.step()
with torch.no_grad():
numclass = 0
for current_val, data in enumerate(YelpDataloader(64, test_dataset)):
classifier.eval()
if current_val% 100 == 0:
print("Current valid: ", current_val)
x = data[0]
y = torch.tensor(data[1]).to(cuda0)
output = classifier(x)
valAccuracy += accuratePredictions(output,y)
epochLoss = loss(output, y - 1)
valLoss += epochLoss.item()
print("Epoch training loss per item: ", trainingLoss/len(train_dataset))
print("Epoch validation loss loss per item: ", valLoss/len(test_dataset))
print("Epoch training accuracy: ", trainAccuracy/len(train_dataset))
print("Epoch val accuracy: ", valAccuracy/len(test_dataset))
if bestValAcc < valAccuracy/len(test_dataset):
bestValAcc = valAccuracy/len(test_dataset)
torch.save(classifier.state_dict(), "model.bin")