Hi,
I’m wanting to train a model to do sentiment classification on the imdb dataset. I downloaded the dataset and am “manually” reading and cleaning it. I figured this was better practice (as in me practicing to do these things) than using the ready-to-go dataset from torchtext.
I’m padding the tensors to make them all of equal length.
Unfortunately my model doesn’t seem to converge. I’m training it with 25000 samples, for 10 epochs with a batch-size of 100.
Here is how I clean-up the reviews:
def clean_text(text):
# throw everything but some chars
output = [char for char in text.lower() if char in string.ascii_lowercase+string.digits+string.whitespace+"<- "]
# additional cleanup
output = re.sub("[-]+", " ", "".join(output).strip().replace("<br", ""))
return output
Here’s my padding:
def pad_tensor(t):
t = torch.tensor(t)
padding = max(review_len) - t.size()[0]
t = torch.nn.functional.pad(t, (0, padding))
return t
my dataset:
class ImdbDataset(Dataset):
def __init__(self):
self.encoded_reviews = []
self.encoded_labels = all_labels
for review in all_reviews:
encoded = []
for word in review.split():
encoded.append(word_to_idx[word])
self.encoded_reviews.append(pad_tensor(encoded))
# decoded = []
# for idx in encoded:
# decoded.append(idx_to_word[idx])
# print(" ".join(decoded))
# break
def __len__(self):
return len(self.encoded_reviews)
def __getitem__(self, idx):
return self.encoded_reviews[idx], self.encoded_labels[idx]
my model:
class ImdbReviewModel(nn.Module):
def __init__(self, vocab_size, hidden_size):
super(ImdbReviewModel, self).__init__()
self.embedding_dims = 150
self.embed = nn.Embedding(vocab_size, self.embedding_dims)
self.gru = nn.GRU(self.embedding_dims, hidden_size)
self.fc1 = nn.Linear(hidden_size, 50)
self.fc2 = nn.Linear(50, 1)
def forward(self, x, h):
x = self.embed(x)
x, h = self.gru(x, h)
x = self.fc1(x)
x = self.fc2(x)
y_hat = torch.sigmoid(x)
return y_hat, h
training:
hidden_size = 128
model = ImdbReviewModel(n_words, hidden_size)
model.to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)
epochs = 100
all_losses = []
h0 = torch.zeros(1, batch_size, hidden_size)
for epoch in range(1, epochs+1):
current_loss = 0
current_avg_loss = 0
for data in train_loader:
x, y = data[0].to(device), data[1].to(device)
h = h0.to(device)
x = x.permute(1, 0)
optimizer.zero_grad()
y_hat, h = model(x, h)
y_hat = y_hat[-1].squeeze()
loss = criterion(y_hat, y.float())
loss.backward()
optimizer.step()
current_loss += loss.item()
current_avg_loss = current_loss / len(train_loader)
all_losses.append(current_avg_loss)
print('%d, progress:%d%%, loss: %.4f' % (epoch, epoch / epochs * 100, current_avg_loss))
For completeness sake, here’s the entire code:
import os
import re
import string
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from torch.utils.data import DataLoader, Dataset, random_split
class_to_idx = {"pos": 1, "neg": 0}
n_classes = 2
all_words = set()
all_reviews = []
all_labels = []
word_to_idx = {}
idx_to_word = {}
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
"""LOAD DATA"""
def clean_text(text):
# throw everything but some chars
output = [char for char in text.lower() if char in string.ascii_lowercase+string.digits+string.whitespace+"<- "]
# additional cleanup
output = re.sub("[-]+", " ", "".join(output).strip().replace("<br", ""))
return output
def read_files(filepath):
for directory, _, filenames in os.walk(filepath):
dir_name = (os.path.split(directory)[-1])
for file in filenames:
all_labels.append(class_to_idx[dir_name])
with open(os.path.abspath(os.path.join(directory, file)), encoding="UTF-8") as f:
review = f.read()
review = clean_text(review)
all_reviews.append(review)
for word in review.split():
all_words.add(word.lower())
print("Reading data...")
read_files(r"data/imdb_data")
print("Preparing variables...")
for i, word in enumerate(sorted(all_words)):
word_to_idx[word] = i+1
idx_to_word[i] = word
word_to_idx["<pad>"] = 0
review_len = [len(i.split()) for i in all_reviews]
def pad_tensor(t):
t = torch.tensor(t)
padding = max(review_len) - t.size()[0]
t = torch.nn.functional.pad(t, (0, padding))
return t
class ImdbDataset(Dataset):
def __init__(self):
self.encoded_reviews = []
self.encoded_labels = all_labels
for review in all_reviews:
encoded = []
for word in review.split():
encoded.append(word_to_idx[word])
self.encoded_reviews.append(pad_tensor(encoded))
# decoded = []
# for idx in encoded:
# decoded.append(idx_to_word[idx])
# print(" ".join(decoded))
# break
def __len__(self):
return len(self.encoded_reviews)
def __getitem__(self, idx):
return self.encoded_reviews[idx], self.encoded_labels[idx]
imdb_dataset = ImdbDataset()
dataset_size = len(imdb_dataset)
n_words = len(all_words)+1
train_dataset, dev_dataset, test_dataset = random_split(imdb_dataset, [int(dataset_size*0.8), int(dataset_size*0.1), int(dataset_size*0.1)])
batch_size = 100
train_loader = DataLoader(train_dataset.dataset, batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset.dataset, 1)
test_loader = DataLoader(test_dataset.dataset, 1)
"""MODEL"""
class ImdbReviewModel(nn.Module):
def __init__(self, vocab_size, hidden_size):
super(ImdbReviewModel, self).__init__()
self.embedding_dims = 150
self.embed = nn.Embedding(vocab_size, self.embedding_dims)
self.gru = nn.GRU(self.embedding_dims, hidden_size)
self.fc1 = nn.Linear(hidden_size, 50)
self.fc2 = nn.Linear(50, 1)
def forward(self, x, h):
x = self.embed(x)
x, h = self.gru(x, h)
x = self.fc1(x)
x = self.fc2(x)
y_hat = torch.sigmoid(x)
return y_hat, h
"""TRAINING"""
print("Training...")
hidden_size = 128
model = ImdbReviewModel(n_words, hidden_size)
model.to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)
epochs = 100
all_losses = []
h0 = torch.zeros(1, batch_size, hidden_size)
for epoch in range(1, epochs+1):
current_loss = 0
current_avg_loss = 0
for data in train_loader:
x, y = data[0].to(device), data[1].to(device)
h = h0.to(device)
x = x.permute(1, 0)
optimizer.zero_grad()
y_hat, h = model(x, h)
y_hat = y_hat[-1].squeeze()
loss = criterion(y_hat, y.float())
loss.backward()
optimizer.step()
current_loss += loss.item()
current_avg_loss = current_loss / len(train_loader)
all_losses.append(current_avg_loss)
print('%d, progress:%d%%, loss: %.4f' % (epoch, epoch / epochs * 100, current_avg_loss))
plt.figure()
plt.plot(all_losses)
plt.show()
Is there anything I’m doing incorrectly?
I’m really stuck on this one and any help would be greatly appreciated!
/Edit:
Here is what the processed text looks like (text, text encoded to a list of idx of the words, padded tensor).
story of a man who has unnatural feelings for a pig starts out with a opening scene that is a terrific example of absurd comedy a formal orchestra audience is turned into an insane violent mob by the crazy chantings of its singers unfortunately it stays absurd the whole time with no general narrative eventually making it just too off putting even those from the era should be turned off the cryptic dialogue would make shakespeare seem easy to a third grader on a technical level its better than you might think with some good cinematography by future great vilmos zsigmond future stars sally kirkland and frederic forrest can be seen briefly
[78799, 58047, 19, 50174, 90676, 36570, 86899, 29093, 30842, 19, 62225, 78158, 59236, 91235, 19, 58693, 71822, 82318, 42313, 19, 82106, 27439, 58047, 313, 15833, 19, 31014, 58869, 5043, 42313, 85468, 41944, 2811, 41390, 88747, 53616, 11405, 82377, 18140, 13420, 58047, 42630, 75046, 86585, 42500, 78249, 313, 82377, 90692, 83336, 91235, 56915, 32909, 55868, 27235, 49997, 42500, 44044, 83922, 58058, 65652, 27208, 82921, 31770, 82377, 26693, 74369, 6762, 85468, 58058, 82377, 18711, 21542, 91809, 49962, 73609, 72796, 24824, 83651, 19, 82779, 34508, 58451, 19, 81683, 47441, 42630, 7743, 82289, 92678, 52787, 82737, 91235, 76509, 34123, 14578, 11405, 32130, 34765, 88679, 93265, 32130, 78118, 71004, 45179, 2908, 31468, 31072, 11881, 6762, 72813, 10206]
tensor([78799, 58047, 19, ..., 0, 0, 0])