Simple binary classification model doesn't converge

Hi,

I want to train a model to do sentiment classification on the IMDB dataset. I downloaded the dataset and am “manually” reading and cleaning it; I figured this was better practice (as in, me getting practice at doing these things myself) than using the ready-to-go dataset from torchtext.

I’m padding the tensors to make them all of equal length.

Unfortunately, my model doesn’t seem to converge. I’m training it on 25,000 samples for 10 epochs with a batch size of 100.

Here is how I clean up the reviews:

def clean_text(text):
    # throw everything but some chars
    output = [char for char in text.lower() if char in string.ascii_lowercase+string.digits+string.whitespace+"<- "]
    # additional cleanup
    output = re.sub("[-]+", " ", "".join(output).strip().replace("<br", ""))
    return output

Here’s my padding function:

def pad_tensor(t):
    t = torch.tensor(t)
    padding = max(review_len) - t.size()[0]
    t = torch.nn.functional.pad(t, (0, padding))
    return t

My dataset:

class ImdbDataset(Dataset):
    def __init__(self):
        self.encoded_reviews = []
        self.encoded_labels = all_labels

        for review in all_reviews:
            encoded = []
            for word in review.split():
                encoded.append(word_to_idx[word])
            self.encoded_reviews.append(pad_tensor(encoded))

            # decoded = []
            # for idx in encoded:
            #    decoded.append(idx_to_word[idx])
            # print(" ".join(decoded))
            # break

    def __len__(self):
        return len(self.encoded_reviews)

    def __getitem__(self, idx):
        return self.encoded_reviews[idx], self.encoded_labels[idx]

My model:

class ImdbReviewModel(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super(ImdbReviewModel, self).__init__()
        self.embedding_dims = 150
        self.embed = nn.Embedding(vocab_size, self.embedding_dims)
        self.gru = nn.GRU(self.embedding_dims, hidden_size)
        self.fc1 = nn.Linear(hidden_size, 50)
        self.fc2 = nn.Linear(50, 1)

    def forward(self, x, h):
        x = self.embed(x)
        x, h = self.gru(x, h)
        x = self.fc1(x)
        x = self.fc2(x)
        y_hat = torch.sigmoid(x)

        return y_hat, h

And here’s the training loop:

hidden_size = 128

model = ImdbReviewModel(n_words, hidden_size)
model.to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)

epochs = 100
all_losses = []

h0 = torch.zeros(1, batch_size, hidden_size)

for epoch in range(1, epochs+1):
    current_loss = 0
    current_avg_loss = 0
    for data in train_loader:

        x, y = data[0].to(device), data[1].to(device)
        h = h0.to(device)

        x = x.permute(1, 0)

        optimizer.zero_grad()

        y_hat, h = model(x, h)
        y_hat = y_hat[-1].squeeze()

        loss = criterion(y_hat, y.float())

        loss.backward()
        optimizer.step()
        current_loss += loss.item()

    current_avg_loss = current_loss / len(train_loader)
    all_losses.append(current_avg_loss)
    print('%d, progress:%d%%, loss: %.4f' % (epoch, epoch / epochs * 100, current_avg_loss))

For completeness’ sake, here’s the entire code:

import os
import re
import string
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
from torch.utils.data import DataLoader, Dataset, random_split


class_to_idx = {"pos": 1, "neg": 0}
n_classes = 2
all_words = set()
all_reviews = []
all_labels = []
word_to_idx = {}
idx_to_word = {}

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
"""LOAD DATA"""


def clean_text(text):
    # throw everything but some chars
    output = [char for char in text.lower() if char in string.ascii_lowercase+string.digits+string.whitespace+"<- "]
    # additional cleanup
    output = re.sub("[-]+", " ", "".join(output).strip().replace("<br", ""))
    return output


def read_files(filepath):
    for directory, _, filenames in os.walk(filepath):
        dir_name = (os.path.split(directory)[-1])
        for file in filenames:
            all_labels.append(class_to_idx[dir_name])
            with open(os.path.abspath(os.path.join(directory, file)), encoding="UTF-8") as f:
                review = f.read()
                review = clean_text(review)
                all_reviews.append(review)
                for word in review.split():
                    all_words.add(word.lower())


print("Reading data...")
read_files(r"data/imdb_data")

print("Preparing variables...")
for i, word in enumerate(sorted(all_words)):
    word_to_idx[word] = i+1
    idx_to_word[i] = word
word_to_idx["<pad>"] = 0

review_len = [len(i.split()) for i in all_reviews]


def pad_tensor(t):
    t = torch.tensor(t)
    padding = max(review_len) - t.size()[0]
    t = torch.nn.functional.pad(t, (0, padding))
    return t


class ImdbDataset(Dataset):
    def __init__(self):
        self.encoded_reviews = []
        self.encoded_labels = all_labels

        for review in all_reviews:
            encoded = []
            for word in review.split():
                encoded.append(word_to_idx[word])
            self.encoded_reviews.append(pad_tensor(encoded))

            # decoded = []
            # for idx in encoded:
            #    decoded.append(idx_to_word[idx])
            # print(" ".join(decoded))
            # break

    def __len__(self):
        return len(self.encoded_reviews)

    def __getitem__(self, idx):
        return self.encoded_reviews[idx], self.encoded_labels[idx]


imdb_dataset = ImdbDataset()
dataset_size = len(imdb_dataset)

n_words = len(all_words)+1


train_dataset, dev_dataset, test_dataset = random_split(imdb_dataset, [int(dataset_size*0.8), int(dataset_size*0.1), int(dataset_size*0.1)])

batch_size = 100

train_loader = DataLoader(train_dataset.dataset, batch_size, shuffle=True)
dev_loader = DataLoader(dev_dataset.dataset, 1)
test_loader = DataLoader(test_dataset.dataset, 1)


"""MODEL"""


class ImdbReviewModel(nn.Module):
    def __init__(self, vocab_size, hidden_size):
        super(ImdbReviewModel, self).__init__()
        self.embedding_dims = 150
        self.embed = nn.Embedding(vocab_size, self.embedding_dims)
        self.gru = nn.GRU(self.embedding_dims, hidden_size)
        self.fc1 = nn.Linear(hidden_size, 50)
        self.fc2 = nn.Linear(50, 1)

    def forward(self, x, h):
        x = self.embed(x)
        x, h = self.gru(x, h)
        x = self.fc1(x)
        x = self.fc2(x)
        y_hat = torch.sigmoid(x)

        return y_hat, h


"""TRAINING"""


print("Training...")

hidden_size = 128

model = ImdbReviewModel(n_words, hidden_size)
model.to(device)
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0003)

epochs = 100
all_losses = []

h0 = torch.zeros(1, batch_size, hidden_size)

for epoch in range(1, epochs+1):
    current_loss = 0
    current_avg_loss = 0
    for data in train_loader:

        x, y = data[0].to(device), data[1].to(device)
        h = h0.to(device)

        x = x.permute(1, 0)

        optimizer.zero_grad()

        y_hat, h = model(x, h)
        y_hat = y_hat[-1].squeeze()

        loss = criterion(y_hat, y.float())

        loss.backward()
        optimizer.step()
        current_loss += loss.item()

    current_avg_loss = current_loss / len(train_loader)
    all_losses.append(current_avg_loss)
    print('%d, progress:%d%%, loss: %.4f' % (epoch, epoch / epochs * 100, current_avg_loss))

plt.figure()
plt.plot(all_losses)
plt.show()

Is there anything I’m doing incorrectly?
I’m really stuck on this one, and any help would be greatly appreciated!

/Edit:
Here is what the processed text looks like (the raw text, the same text encoded as a list of word indices, and the padded tensor):

story of a man who has unnatural feelings for a pig starts out with a opening scene that is a terrific example of absurd comedy a formal orchestra audience is turned into an insane violent mob by the crazy chantings of its singers unfortunately it stays absurd the whole time with no general narrative eventually making it just too off putting even those from the era should be turned off the cryptic dialogue would make shakespeare seem easy to a third grader on a technical level its better than you might think with some good cinematography by future great vilmos zsigmond future stars sally kirkland and frederic forrest can be seen briefly

[78799, 58047, 19, 50174, 90676, 36570, 86899, 29093, 30842, 19, 62225, 78158, 59236, 91235, 19, 58693, 71822, 82318, 42313, 19, 82106, 27439, 58047, 313, 15833, 19, 31014, 58869, 5043, 42313, 85468, 41944, 2811, 41390, 88747, 53616, 11405, 82377, 18140, 13420, 58047, 42630, 75046, 86585, 42500, 78249, 313, 82377, 90692, 83336, 91235, 56915, 32909, 55868, 27235, 49997, 42500, 44044, 83922, 58058, 65652, 27208, 82921, 31770, 82377, 26693, 74369, 6762, 85468, 58058, 82377, 18711, 21542, 91809, 49962, 73609, 72796, 24824, 83651, 19, 82779, 34508, 58451, 19, 81683, 47441, 42630, 7743, 82289, 92678, 52787, 82737, 91235, 76509, 34123, 14578, 11405, 32130, 34765, 88679, 93265, 32130, 78118, 71004, 45179, 2908, 31468, 31072, 11881, 6762, 72813, 10206]

tensor([78799, 58047, 19, ..., 0, 0, 0])

Okay, it seems to be training now: the loss is decreasing.
I cut out all reviews longer than 500 words, which reduces the padding significantly (roughly as in the sketch below).
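
The filtering happens right after reading the files, before the reviews are encoded. Something along these lines (the MAX_WORDS name and the list-comprehension style are just illustrative, not copied verbatim from my script):

MAX_WORDS = 500

# keep only the reviews (and their matching labels) that are at most MAX_WORDS long,
# so the padding target is bounded by 500 instead of the longest review in the corpus
kept = [(review, label)
        for review, label in zip(all_reviews, all_labels)
        if len(review.split()) <= MAX_WORDS]
all_reviews = [review for review, _ in kept]
all_labels = [label for _, label in kept]

# recompute the lengths so pad_tensor() pads to the new, much smaller maximum
review_len = [len(review.split()) for review in all_reviews]

With that, max(review_len) is capped at 500, so the GRU no longer has to run over hundreds of trailing padding tokens for most reviews.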