Large performance gap between PyTorch and Keras for IMDB sentiment analysis model

I tried to compare a simple IMDB sentiment analysis implementation between PyTorch and Keras and found that the two give quite different test accuracy results:

Test Accuracy:

  • PyTorch v1.9.1: ~66%
  • Keras v2.4.1: ~77%

I would have expected to get approximately similar results.

The model consists of an Embedding layer, followed by a Flatten and a linear projection to a single output score. The two implementations are given below.

Any ideas if I am doing something wrong, or why there is such a large gap?

Here is a Colab notebook

PyTorch Model

import sys

import numpy as np
import torch
import torch.nn.functional as F
from tensorflow.keras import preprocessing
from tensorflow.keras.datasets import imdb
from torch import nn, optim
from torch.utils.data import DataLoader
from tqdm import tqdm

batch_size = 512
learning_rate = 0.001
maxlen = 20
vocab_size = 10000
embedding_dim = 16
output_dim = 1
n_epochs = 10
padding_idx = 0

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=vocab_size)

x_train = preprocessing.sequence.pad_sequences(train_data, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(test_data, maxlen=maxlen)
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')

train_data = list(zip(x_train, y_train))
test_data = list(zip(x_test, y_test))

train_dataloader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_data, batch_size=batch_size, shuffle=True)


class CBOWFlattenNet(nn.Module):
    def __init__(self, vocab_size, embedding_dim, output_dim, padding_idx, maxlen):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        self.flatten = nn.Flatten(start_dim=-2, end_dim=-1)
        self.fc = nn.Linear(embedding_dim * maxlen, output_dim)

    def forward(self, batch):
        embedded = self.embedding(batch)
        flattened = self.flatten(embedded)
        score = self.fc(flattened)
        score = score.squeeze(dim=1)
        score = F.sigmoid(score)
        return score


model = CBOWFlattenNet(vocab_size, embedding_dim, output_dim, padding_idx, maxlen)

loss_function = nn.BCELoss()
optimizer = optim.RMSprop(
    model.parameters(),
    lr=learning_rate,
    momentum=0.0,
    alpha=0.9,
    eps=1e-7,
    centered=False,
    weight_decay=0.0,
)


def get_accuracy_bce(prediction, label):
    batch_size = prediction.shape[0]
    predicted_classes = prediction > 0.5
    correct_predictions = (predicted_classes == label).sum()
    accuracy = correct_predictions / batch_size
    return accuracy


for epoch in range(n_epochs):
    epoch_train_losses = []
    epoch_train_accs = []
    epoch_val_losses = []
    epoch_val_accs = []

    model.train()

    for sentences, labels in tqdm(train_dataloader, desc='training...', file=sys.stdout):
        model.zero_grad()

        score = model(sentences)

        loss = loss_function(score, labels)
        loss.backward()
        optimizer.step()

        acc = get_accuracy_bce(score, labels)

        epoch_train_losses.append(loss.item())
        epoch_train_accs.append(acc.item())

    model.eval()

    with torch.no_grad():
        for sentences, labels in tqdm(test_dataloader, desc='evaluating...', file=sys.stdout):
            score = model(sentences)

            loss = loss_function(score, labels)
            acc = get_accuracy_bce(score, labels)

            epoch_val_losses.append(loss.item())
            epoch_val_accs.append(acc.item())

    epoch_train_loss = np.mean(epoch_train_losses)
    epoch_train_acc = np.mean(epoch_train_accs)
    epoch_val_loss = np.mean(epoch_val_losses)
    epoch_val_acc = np.mean(epoch_val_accs)

    print(f'epoch: {epoch+1}/{n_epochs} train_loss: {epoch_train_loss:.4f} train_acc: {epoch_train_acc:.4f} val_loss: {epoch_val_loss:.4f} val_acc: {epoch_val_acc:.4f}')

Keras Model


import numpy as np
import tensorflow as tf
from tensorflow.keras import models, layers, preprocessing
from tensorflow.keras.datasets import imdb

batch_size = 512
learning_rate = 0.001
maxlen = 20
vocab_size = 10000
embedding_dim = 16
output_dim = 1
n_epochs = 10

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=vocab_size)

x_train = preprocessing.sequence.pad_sequences(train_data, maxlen=maxlen)
x_test = preprocessing.sequence.pad_sequences(test_data, maxlen=maxlen)
y_train = np.asarray(train_labels).astype('float32')
y_test = np.asarray(test_labels).astype('float32')

model = models.Sequential()
model.add(layers.Embedding(vocab_size, embedding_dim, input_length=maxlen))
model.add(layers.Flatten())
model.add(layers.Dense(output_dim, activation='sigmoid'))

opt = tf.keras.optimizers.RMSprop(
    learning_rate=learning_rate,
    rho=0.9,
    momentum=0.0,
    epsilon=1e-7,
    centered=False,
)

model.compile(optimizer=opt, loss='binary_crossentropy', metrics=['acc'])

history = model.fit(x_train, y_train, epochs=n_epochs, batch_size=batch_size, shuffle=True)
results = model.evaluate(x_test, y_test)
print(results)

Following this, a custom initialization results in a test accuracy of 76%:

nn.init.uniform_(self.embedding.weight, -0.05, 0.05)
nn.init.xavier_uniform_(self.fc.weight, 1.0)
nn.init.zeros_(self.fc.bias)
1 Like

For the record: Keras pad_sequences truncates the start of the sequences (truncating=pre) by default. For IMDB sentiment analysis this is important to achieve good test accuracy with short maxlen. In other words: it seems the most informative tokens are at the end of the sequences.