A simple LSTM stuck on label flipping

Hi All,

I am new to machine learning and PyTorch. To get accustomed to the code, I am training a simple LSTM for sentiment analysis on the Amazon review data from Kaggle (target label 1 or 0). The input to the LSTM is a character-level sequence.

My training is stuck on label flipping: in each batch either all the predictions are 0 or all the predictions are 1.
I tried overfitting a single batch of 10, but the loss does not get down to 0.0 or anywhere close to it.

I tried different devices: CPU, MPS on my Mac, and CUDA on Kaggle. All of them show the same issue.
A simple MLP module I had is able to overfit a batch of 64 from the 6th step.
To avoid vanishing gradients in BPTT, I filter the inputs/targets to those where the input length is < 256 chars.

Can you please suggest what I am doing wrong and how to fix it?

Here is my kaggle notebook - amazon_review_2 | Kaggle

Below is my code:

import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import torch
import pandas as pd


dataset_dir = '/Users/subhojit/datasets/amazon_review_polarity_csv'
df_train = pd.read_csv(dataset_dir + '/train.csv')
df_test = pd.read_csv(dataset_dir + '/test.csv')

review = df_train.iloc[:, 2].to_numpy()
test_review = df_test.iloc[:, 2].to_numpy()
all_review = np.concatenate((review, test_review))
c = sorted(list(set(''.join(all_review))))
chars = c + ['<SOS>', '<EOS>', '<PAD>']
stoi = {ch:i for i, ch in enumerate(chars)}
itos = {i:ch for i, ch in enumerate(chars)}
vocab_size = len(chars)


df_filtered = df_train[df_train.iloc[:, 2].str.len() < 256]
x_filtered = df_filtered.iloc[:, 2].to_numpy()
y_filtered = df_filtered.iloc[:, 0].to_numpy()

n = int(0.9*len(x_filtered))
xtrain = x_filtered[:n]
ytrain = y_filtered[:n]
xval = x_filtered[n:]
yval = y_filtered[n:]

xtest = df_test.iloc[:, 2].to_numpy()
ytest = df_test.iloc[:, 0].to_numpy()

encode = lambda s: [stoi[xi] for xi in s]
decode = lambda l: ''.join([itos[li] for li in l])

if torch.backends.mps.is_available():
    device = "mps"
elif torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

def pad_sequences(sequences):
    pad_index = stoi['<PAD>']
    max_len = np.max([len(s) for s in sequences])
    padded_seq = np.full((len(sequences), max_len), pad_index, dtype=np.int32)
    for i, seq in enumerate(sequences):
        padded_seq[i, :len(seq)] = seq
    return padded_seq


def get_batch(batch_size, split='train'):
    data = xtrain if split == 'train' else xval
    target = ytrain if split == 'train' else yval
    idx = np.random.randint(0, len(data), (batch_size,))
    x_sample = [encode(s) for s in data[idx]]
    y_sample = target[idx]
    xpadded = pad_sequences(x_sample)
    xb, yb = xpadded, y_sample
    yb = torch.from_numpy(yb)
    yb = yb - 1  # dataset labels are 1/2; shift them to 0/1
    # yb = torch.nn.functional.one_hot(yb - 1, num_classes=2)
    xb = torch.from_numpy(xb)
    x = xb.to(device, dtype=torch.long)
    y = yb.to(device, dtype=torch.long)
    return x, y


class LSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=True
        )
        self.fc = nn.Linear(hidden_size * 2, output_size)
        nn.init.xavier_uniform_(self.fc.weight)
        nn.init.zeros_(self.fc.bias)

    def forward(self, x):
        x = self.embedding(x)            # (B, T, D)
        output, (hn, cn) = self.lstm(x)  # output: (B, T, 2H) since bidirectional=True
        last_hidden = output[:, -1, :]   # (B, 2H)  <-- last time step
        logits = self.fc(last_hidden)    # (B, C)
        return logits


embedding_dim = 32
hidden_size = 256
output_size = 2
batch_size = 64
seq_len = 10
learning_rate = 1e-2
max_iter = 5000
eval_interval = 500

model = LSTMClassifier(vocab_size, embedding_dim, hidden_size, output_size)
# model = MLPClassifier(vocab_size, embedding_dim, hidden_size, output_size, xb.shape[1])
model = model.to(device)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-2)

for step in range(max_iter):
    xb, yb = get_batch(batch_size)
    logits = model(xb)
    loss = F.cross_entropy(logits, yb)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    with torch.no_grad():
        logits = model(xb)
        probs = torch.softmax(logits, dim=1)
        print("Confidence range:", probs.max(dim=1).values[:10])
        preds = torch.argmax(logits, dim=1)
        print("Preds: ", preds.tolist())
        print("Targets: ", yb.tolist())

    if step % eval_interval == 0:
        print(f"step {step}: train loss {loss:.4f}")

Is the loss going down? Can you overfit on a very small dataset, e.g., just a single batch? 1e-2 feels like a large value, but that’s more of a gut feeling.

While it probably won’t solve your problem, you don’t want to use

last_hidden = output[:, -1, :]

if you have bidirectional=True.

output[:, -1, :] gives you the last hidden state of the forward pass but only the first hidden state of the backward pass; the last hidden state of the backward pass sits at the first time step. It’s typically much more convenient to use hn instead of output, since it contains both respective last hidden states.
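
Something along these lines works (just a sketch with made-up sizes, assuming a single-layer bidirectional LSTM with batch_first=True; all names and sizes are only illustrative):

import torch
import torch.nn as nn

B, T, D, H = 4, 12, 32, 256                       # illustrative sizes
lstm = nn.LSTM(input_size=D, hidden_size=H, batch_first=True, bidirectional=True)
fc = nn.Linear(2 * H, 2)

x = torch.randn(B, T, D)                          # already-embedded input
output, (hn, cn) = lstm(x)                        # output: (B, T, 2H), hn: (2, B, H)

# hn[-2] is the forward direction's last hidden state,
# hn[-1] is the backward direction's last hidden state
last_hidden = torch.cat([hn[-2], hn[-1]], dim=1)  # (B, 2H)
logits = fc(last_hidden)                          # (B, 2)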

Hi Chris,

Thanks for your response.

Got your point on bidirectional=True and not using output[:, -1, :] in that case. It was a mistake on my side; I have rectified it.

Yes, I tried overfitting a single batch and the model was not able to overfit it. After the post I did some more debugging and observed that the gradient norm goes to 0.0 after a few steps.
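
For reference, a small snippet like this, placed right after loss.backward() in the training loop, is one way to check the overall gradient norm:

# One way to inspect the total gradient norm, right after loss.backward()
with torch.no_grad():
    grads = [p.grad.norm() for p in model.parameters() if p.grad is not None]
    total_norm = torch.norm(torch.stack(grads))
    print(f"step {step}: grad norm {total_norm.item():.6f}")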

It seems that is mainly because of the padding tokens I was adding, and because the padding token index was not 0. As I was using variable-length input sequences, the gradient was killed during BPTT by the padded tokens.
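
For reference, the padding_idx part looks roughly like this (a sketch reusing stoi, vocab_size, and embedding_dim from my code above); it keeps the <PAD> embedding at zero and excludes it from gradient updates, though on its own it does not stop the LSTM from stepping over the padded positions:

# <PAD> embedding stays zero and gets no gradient updates
pad_idx = stoi['<PAD>']
embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)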

As a solution, I now take the actual length of each input sequence and pass the lengths into the LSTM forward pass, so that the forward and backward pass are not run over the padded tokens. Along with that, I take the hidden state at the end of the real input sequence (just before the padding) and pass it through the fully connected layer to produce the logits. After this I am able to overfit a single batch of decent size (64) and the loss gets to 0.0.
I am also able to train the model and get good accuracy on the validation set.

I tried this on Kaggle with CUDA and on my Mac with MPS, on a smaller dataset, and it worked. Today I will try it on the Amazon reviews dataset.

kaggle notebook - SMS_Spam_Detection | Kaggle

Below is the 1-batch overfit code:

import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
from sklearn.model_selection import train_test_split

dataset_dir = '/Users/subhojit/datasets/sms_spam_collection'
df = pd.read_csv(dataset_dir + "/SMSSpamCollection", sep='\t', header=None, names=['label', 'text'])

df['label'] = df['label'].map({'ham': 0, 'spam': 1})
texts = df['text'].tolist()
labels = df['label'].tolist()

chars = sorted(set(''.join(texts)))
stoi = {ch: i + 1 for i, ch in enumerate(chars)}
stoi['<PAD>'] = 0
vocab_size = len(stoi)
encode = lambda s: [stoi[c] for c in s if c in stoi]

xtrain, xval, ytrain, yval = train_test_split(texts, labels, test_size=0.2, random_state=42)

def pad_sequences(sequences, max_len=256):
    padded = torch.zeros(len(sequences), max_len, dtype=torch.long)
    lengths = torch.zeros(len(sequences), dtype=torch.long)
    for i, seq in enumerate(sequences):
        seq = seq[:max_len]
        padded[i, :len(seq)] = torch.tensor(seq)
        lengths[i] = len(seq)
    return padded, lengths

def get_batch(batch_size, split='train'):
    x = xtrain if split == 'train' else xval
    y = ytrain if split == 'train' else yval
    idx = torch.randint(0, len(x), (batch_size,))
    xb = [encode(x[i]) for i in idx]
    yb = [y[i] for i in idx]
    xb, lengths = pad_sequences(xb)
    return xb, torch.tensor(yb, dtype=torch.long), lengths

class ManualLSTMClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=0)
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x, lengths):
        x_embed = self.embedding(x)
        out, _ = self.lstm(x_embed)

        # Pick the hidden state at the last non-padded time step of each sequence
        batch_size = x.size(0)
        last_hidden = torch.zeros(batch_size, out.size(2), device=x.device)
        for i in range(batch_size):
            last_hidden[i] = out[i, lengths[i] - 1]

        return self.fc(last_hidden)

device = 'mps' if torch.backends.mps.is_available() else ('cuda' if torch.cuda.is_available() else 'cpu')
model = ManualLSTMClassifier(vocab_size, embed_dim=32, hidden_dim=64, output_dim=2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-2)

xb, yb, lengths = get_batch(32)
xb, yb, lengths = xb.to(device), yb.to(device), lengths.to(device)

for step in range(1000):
    logits = model(xb, lengths)
    loss = F.cross_entropy(logits, yb)
    optimizer.zero_grad()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    optimizer.step()

    if step % 100 == 0:
        preds = torch.argmax(logits, dim=1)
        print(f"Step {step}, Loss: {loss.item():.4f}")
        print("Preds:", preds.tolist())
        print("Targets:", yb.tolist())
        print()

All the padding and the extra steps to ignore the padding when calculating the loss come almost for free using methods provided by PyTorch. You may want to have a look at PackedSequence, pack_padded_sequence, and pad_sequence for that. They will make your code smaller, cleaner, and less error-prone.
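
For example, the forward pass of your classifier could then look roughly like this (a sketch, assuming batch_first=True, a unidirectional LSTM, and an integer lengths tensor; pack_padded_sequence wants the lengths on the CPU):

from torch.nn.utils.rnn import pack_padded_sequence

def forward(self, x, lengths):
    x_embed = self.embedding(x)                   # (B, T, D)
    packed = pack_padded_sequence(x_embed, lengths.cpu(),
                                  batch_first=True, enforce_sorted=False)
    _, (hn, cn) = self.lstm(packed)               # hn: (num_layers, B, H)
    return self.fc(hn[-1])                        # final hidden state of the last layer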

I also have two Jupyter notebooks: one that covers those methods in detail here, and one that goes through a practical example here. Maybe they are useful.

Thanks for the pointers, they are an interesting read.

I had come across the PyTorch utility pack_padded_sequence, but I wanted to write that part myself.