Problems with PyTorch MLP when training the MNIST dataset retrieved from Keras

I have finished a PyTorch MLP model for the MNIST dataset, but I get two very different results: 0.90+ accuracy when using the MNIST dataset from PyTorch, but ~0.10 accuracy when using the MNIST dataset from Keras. Below is my code; dependencies: PyTorch 0.3.0.post4, Keras 2.1.3, TensorFlow 1.4.1 (GPU) as the Keras backend.

# -*-coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import torch as pt
import torchvision as ptv
from keras.datasets import mnist
from torch.nn import functional as F
from torch.utils.data import Dataset, DataLoader

# torchvision MNIST; ToTensor() converts the PIL images to float tensors
train_set = ptv.datasets.MNIST("./data/mnist/train", train=True, transform=ptv.transforms.ToTensor(), download=True)
test_set = ptv.datasets.MNIST("./data/mnist/test", train=False, transform=ptv.transforms.ToTensor(), download=True)

train_dataset = DataLoader(train_set, batch_size=100, shuffle=True)
test_dataset = DataLoader(test_set, batch_size=10000, shuffle=True)


class MLP(pt.nn.Module):

    def __init__(self):
        super(MLP, self).__init__()
        self.fc1 = pt.nn.Linear(784, 512)
        self.fc2 = pt.nn.Linear(512, 128)
        self.fc3 = pt.nn.Linear(128, 10)
        self.use_gpu = True

    def forward(self, din):
        din = din.view(-1, 28 * 28)
        dout = F.relu(self.fc1(din))
        dout = F.relu(self.fc2(dout))
        # return F.softmax(self.fc3(dout))
        # no softmax needed here: CrossEntropyLoss applies log-softmax internally
        return self.fc3(dout)

model = MLP().cuda()
print(model)

# loss func and optim
optimizer = pt.optim.SGD(model.parameters(), lr=1)
criterion = pt.nn.CrossEntropyLoss().cuda()


def evaluate_acc(pred, label):
    # fraction of samples where the argmax of the logits matches the label
    pred = pred.cpu().data.numpy()
    label = label.cpu().data.numpy()
    test_np = (np.argmax(pred, 1) == label)
    test_np = np.float32(test_np)
    return np.mean(test_np)


def evaluate_loader(loader):
    print("evaluating ...")
    accuracy_list = []
    for i, (inputs, labels) in enumerate(loader):
        inputs = pt.autograd.Variable(inputs).cuda()
        labels = pt.autograd.Variable(labels).cuda()
        outputs = model(inputs)
        accuracy_list.append(evaluate_acc(outputs, labels))
    print(sum(accuracy_list) / len(accuracy_list))


def training(loader, epochs):
    for epoch in range(epochs):
        for i, data in enumerate(loader):

            optimizer.zero_grad()

            (inputs, labels) = data
            inputs = pt.autograd.Variable(inputs).cuda()
            labels = pt.autograd.Variable(labels).cuda()

            outputs = model(inputs)

            loss = criterion(outputs, labels)
            loss.backward()

            optimizer.step()

            if i % 200 == 0:
                print(i, ":", evaluate_acc(outputs, labels))

training(train_dataset, 4)
evaluate_loader(test_dataset)

print("###########################################################")


def load_mnist():
    (x, y), (x_test, y_test) = mnist.load_data()
    # reshape to (N, 1, 28, 28) and cast to the dtypes PyTorch expects
    x = x.reshape((-1, 1, 28, 28)).astype(np.float32)
    x_test = x_test.reshape((-1, 1, 28, 28)).astype(np.float32)
    y = y.astype(np.int64)
    y_test = y_test.astype(np.int64)
    print("x.shape", x.shape, "y.shape", y.shape,
          "\nx_test.shape", x_test.shape, "y_test.shape", y_test.shape,
          )

    return x, y, x_test, y_test


class TMPDataset(Dataset):
    # minimal Dataset wrapping the in-memory numpy arrays from Keras

    def __init__(self, a, b):
        self.x = a
        self.y = b

    def __getitem__(self, item):
        return self.x[item], self.y[item]

    def __len__(self):
        return len(self.y)


x_train, y_train, x_test, y_test = load_mnist()
test_loader = DataLoader(TMPDataset(x_test, y_test), num_workers=1, batch_size=10000)
train_loader = DataLoader(TMPDataset(x_train, y_train), shuffle=True, batch_size=100)

evaluate_loader(test_loader)
evaluate_loader(train_loader)

model = MLP().cuda()
print(model)

optimizer = pt.optim.SGD(model.parameters(), lr=1)
criterion = pt.nn.CrossEntropyLoss().cuda()

training(train_loader, 4)
evaluate_loader(test_loader)
evaluate_loader(train_loader)

I have checked several samples from the Keras MNIST dataset and found no errors. I am wondering what is wrong with the dataset?

The Keras dataset is unnormalized, i.e. the pixel values are in [0, 255]. torchvision's ToTensor() scales the data to the range [0, 1], which often helps the model learn.

As your two results show, normalizing indeed helps a lot! :wink:
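
To see the difference directly, here is a quick check (just a sketch, reusing the train_set and mnist names already defined in your script) that prints the pixel range of each dataset:

# compare the pixel ranges of the two datasets
x_torch, _ = train_set[0]              # ToTensor() already applied
print(x_torch.min(), x_torch.max())    # -> 0.0 1.0
(x_keras, _), _ = mnist.load_data()    # raw uint8 arrays
print(x_keras.min(), x_keras.max())    # -> 0 255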

Just divide the input by 255. in load_mnist and your model will also learn with the Keras data:

def load_mnist():
    (x, y), (x_test, y_test) = mnist.load_data()
    x = x.reshape((-1, 1, 28, 28)).astype(np.float32)
    x_test = x_test.reshape((-1, 1, 28, 28)).astype(np.float32)
    y = y.astype(np.int64)
    y_test = y_test.astype(np.int64)
    x = x / 255.            # scale to [0, 1], matching what ToTensor() does
    x_test = x_test / 255.
    
    print("x.shape", x.shape, "y.shape", y.shape,
          "\nx_test.shape", x_test.shape, "y_test.shape", y_test.shape,
          )

    return x, y, x_test, y_test
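
Alternatively (a sketch, not required for the fix), you could leave load_mnist unchanged and scale on the fly inside the Dataset; here a hypothetical ScaledDataset variant of your TMPDataset:

class ScaledDataset(Dataset):
    # hypothetical variant of TMPDataset that scales pixels to [0, 1] on access

    def __init__(self, a, b):
        self.x = a
        self.y = b

    def __getitem__(self, item):
        # dividing a float32 array by a Python float keeps dtype float32
        return self.x[item] / 255., self.y[item]

    def __len__(self):
        return len(self.y)

Both approaches feed the same [0, 1] inputs to the model; scaling once in load_mnist is cheaper, while scaling per item keeps the raw arrays untouched.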

It works. Thanks a lot!! :+1: :+1: