I’ve tried:
- One-hot encoding, didn’t converge
- Tokenizing words and training on one token at a time, didn’t converge
- Training on full posts at a time, didn’t converge
- Improving the tokenization so that words are tokenized from most to least common, didn’t converge
- Reducing the dataset to posts of fewer than 100 words, didn’t converge
- Using my own RNN network, didn’t converge
- Using nn.RNN, didn’t converge
- Using nn.LSTM, didn’t converge
- Adding more LSTM layers, didn’t converge
- Adding more FC layers before the output, didn’t converge
- Adding tanh between the FC layers, didn’t converge
- Using Adam, didn’t converge
- Using SGD with momentum, didn’t converge
- Using batched training, didn’t converge
- Using two outputs with CrossEntropyLoss (sketched after this list), didn’t converge
- Using one output with BCEWithLogitsLoss (sketched after this list), didn’t converge
- Trying multiple mixes of the above, didn’t converge
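For clarity, the two output/loss setups mentioned above look roughly like this in PyTorch (dummy tensors and shapes, not my actual training code, which is further down):

import torch
import torch.nn as nn
# two logits per post, class-index targets
loss_two_outputs = nn.CrossEntropyLoss()(torch.randn(4, 2), torch.tensor([0, 1, 1, 0]))
# one logit per post, float targets with the same shape as the logits
loss_one_output = nn.BCEWithLogitsLoss()(torch.randn(4, 1), torch.tensor([[0.], [1.], [1.], [0.]]))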
I don’t know what to do anymore. Coming from a development background, I’m not used to this alchemical approach to building networks…
This is my code:
# install packages
!pip install -U "portalocker>=2.0.0"
# fetch the data
import torch
from torchtext.datasets import IMDB
from torch.utils.data import DataLoader
train_dp = IMDB(split="train")
test_dp = IMDB(split="test")
train_iter = iter(train_dp)
test_iter = iter(test_dp)
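# (Illustrative peek at one raw sample, added here; with this torchtext version each
# item appears to be an (integer label, review text) pair, which is what the
# loading loop below also assumes.)
sample_label, sample_text = next(train_iter)
print(sample_label, sample_text[:80])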
# preprocessing functions
import unicodedata
import re
char_list = [
    'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
    'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z',
    '0', '1', '2', '3', '4', '5', '6', '7', '8', '9'
]
separators = [
    '!', '"', '%', '&', "'", '(', ')', '*', '+', ',', '-',
    '.', '/', ':', ';', '?', '@',
    '_'
]
spaces = [
    ' '
]
eye_matrix = torch.eye(len(char_list))
# Map characters to indices in your character set
char_to_index = {char: index for index, char in enumerate(char_list)}
def unicode_to_ascii(s):
    return unicodedata.normalize('NFD', s).encode('ascii', 'ignore').decode('ascii')
def filter_string(input_string, char_list = char_list + separators + spaces):
    return ''.join([char for char in input_string if char in char_list])
def clean_string(input_string):
    return filter_string(unicode_to_ascii(input_string.lower()))
# Function to convert string to a matrix of one-hot vectors
def string_to_matrix(input_string, char_to_index, identity_matrix):
    indices = [char_to_index[char] for char in input_string if char in char_to_index]
    return identity_matrix[indices]
def one_hot_encode(s: str):
    return string_to_matrix(filter_string(unicode_to_ascii(s.lower()), char_list), char_to_index, eye_matrix)
def split_string(input_string, separators = separators, spaces = spaces):
    # Escape separators for regex and combine them with spaces
    escaped_separators = map(re.escape, separators)
    escaped_spaces = map(re.escape, spaces)
    pattern = f"({'|'.join(escaped_separators)}|{'|'.join(escaped_spaces)})"
    # Split the string based on the pattern
    split_list = re.split(pattern, input_string)
    # Remove spaces, empty strings, and lowercase words
    cleaned_list = [element.lower() for element in split_list if element not in spaces and element != '']
    return cleaned_list
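# (Illustrative check, not part of the original script: what the helpers above
# produce for a made-up review snippet. Note that separators are kept as tokens.)
sample = "Great movie!"
print(clean_string(sample))                # -> "great movie!"
print(split_string(clean_string(sample)))  # -> ['great', 'movie', '!']
print(one_hot_encode(sample).shape)        # -> torch.Size([10, 36]); spaces/separators dropped here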
# load the data, preferably only once
# this dataloader sucks
train_dataloader = DataLoader(dataset=train_dp, shuffle=True)
posts = []
pos_count = 0
neg_count = 0
for sentiment, text in train_dataloader:
    post = text[0]  # the DataLoader wraps each review in a batch of size 1
    posts.append((sentiment, post))
    if sentiment.item() == 1:  # label 1 is treated as negative here
        neg_count += 1
    else:
        pos_count += 1
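# Quick balance check (print added for illustration); the IMDB train split
# should contain an equal number of positive and negative reviews.
print(f"positive: {pos_count}, negative: {neg_count}, total: {len(posts)}")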
# define the tokenizer class
class Tokenizer():
    def __init__(self, size: int, padding_num: int = 0):
        self.dict = {}  # word -> token id; ids start at 1 so 0 stays free for padding
        self.size = size
        self.padding_num = padding_num
    def add_word(self, word: str):
        if word not in self.dict:
            self.dict[word] = len(self.dict) + 1
    def tokenize(self, words, size=0, do_pad=False):
        if size == 0: size = self.size
        arr = []
        for word in words:
            self.add_word(word)
            arr.append(self.dict[word])
        if not do_pad: return arr
        # truncate or pad to a fixed length
        if len(arr) > size:
            arr = arr[0:size]
        else:
            padding = [self.padding_num] * (size - len(arr))
            arr.extend(padding)
        return arr
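# (Illustrative usage with made-up words, not in the original script.)
demo_tokenizer = Tokenizer(size=5, padding_num=0)
print(demo_tokenizer.tokenize(["good", "movie", "good"], do_pad=True))  # -> [1, 2, 1, 0, 0]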
# order posts by frequency and tokenize their words
from collections import Counter
word_counter = Counter()
for _, text in posts:
    words = split_string(clean_string(text))
    word_counter.update(words)
tokenizer = Tokenizer(100, 0)
# register words from most to least common, so frequent words get the smallest token ids
words = [item[0] for item in word_counter.most_common()]
tokenizer.tokenize(words)
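# Sanity check (illustrative, added here): the most frequent token should have id 1.
top_word = word_counter.most_common(1)[0][0]
print(repr(top_word), tokenizer.dict[top_word])  # the printed id should be 1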
# define LSTM network
import torch
import torch.nn as nn
class LSTM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_features, out_features, num_layers=1):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_features
        self.num_layers = num_layers
        # Embedding layer
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # LSTM layer
        self.lstm = nn.LSTM(embedding_dim, hidden_features, num_layers, batch_first=True)
        # Output layer
        self.fc = nn.Sequential(
            nn.Linear(hidden_features, hidden_features // 2),
            nn.Tanh(),
            nn.Linear(hidden_features // 2, hidden_features // 4),
            nn.Tanh(),
            nn.Linear(hidden_features // 4, out_features)
        )
    def forward(self, input, hidden):
        # Embedding layer
        embedded = self.embedding(input)
        # LSTM layer
        output, (hidden, cell) = self.lstm(embedded, hidden)
        # Take the output of the last time step
        output = output[:, -1, :]
        # Compute final output
        output = self.fc(output)
        return output, (hidden, cell)
    def init_hidden(self, batch_size):
        # Initialize hidden state and cell state with zeros
        hidden_state = torch.zeros(self.num_layers, batch_size, self.hidden_size)
        cell_state = torch.zeros(self.num_layers, batch_size, self.hidden_size)
        return (hidden_state, cell_state)
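# (Hypothetical shape check with dummy sizes, not part of the actual training run.)
_demo = LSTM(vocab_size=50, embedding_dim=8, hidden_features=16, out_features=1, num_layers=2)
_tokens = torch.randint(0, 50, (4, 10))        # batch of 4 sequences, 10 token ids each
_out, _ = _demo(_tokens, _demo.init_hidden(4))
print(_out.shape)                              # -> torch.Size([4, 1]), one logit per post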
# write the training loop
def batch_generator(array, batch_size):
    for i in range(0, len(array), batch_size):
        yield array[i:i + batch_size]
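# (Illustrative check of the generator; the trailing short batch is skipped later in the loop.)
print(list(batch_generator([1, 2, 3, 4, 5], 2)))  # -> [[1, 2], [3, 4], [5]]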
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = LSTM(vocab_size=(len(tokenizer.dict) + 1), embedding_dim=100, hidden_features=100, out_features=1, num_layers=2).to(device)
import math
import matplotlib.pyplot as plt
import random
if pos_count == 0 or neg_count == 0:
    print("ERROR data loaded incorrectly")
losses = []
loss_fn = nn.BCEWithLogitsLoss()
optim = torch.optim.Adam(model.parameters(), lr=0.0001)
# optim = torch.optim.SGD(model.parameters(), lr=0.00001, momentum=0.9)
epochs = 30
tensor_size = 100
batch_size = 4
for epoch in range(epochs):
    # keep only posts of at most tensor_size*4 characters
    filtered_posts = [post for post in posts if len(post[1]) <= tensor_size * 4]
    # randomize the posts; the dataloader doesn't do it correctly
    random.shuffle(filtered_posts)
    model.train()
    count = 0
    for batch in batch_generator(filtered_posts, batch_size=batch_size):
        # if the last batch has a smaller size, skip it
        if len(batch) < batch_size: break
        hidden = model.init_hidden(batch_size)
        device_hidden = (hidden[0].to(device), hidden[1].to(device))
        words_list = [split_string(clean_string(item[1])) for item in batch]
        tokenized_words_list = [tokenizer.tokenize(words, tensor_size, True) for words in words_list]
        vectors = torch.tensor(tokenized_words_list, device=device)
        output, hidden = model(vectors, device_hidden)
        # label 1 (negative) -> target 0, label 2 (positive) -> target 1
        targets = torch.tensor([0 if item[0].item() == 1 else 1 for item in batch], device=device, dtype=torch.float32)
        loss = loss_fn(output, targets.view(-1, 1))
        losses.append(loss.item())
        optim.zero_grad()
        loss.backward()
        optim.step()
        if math.isnan(loss.item()):
            print(f"NaN loss at batch {count} of epoch {epoch}")
            break
        if count % 50 == 0:
            print(f"Epoch {epoch}.")
            print(f"Analyzed {count * batch_size} posts.")
            plt.figure()
            plt.plot(losses)
            plt.show()
        count += 1