I’m training a simple RNN on this dataset: https://ufile.io/gf7xo (I’m including the link so you can try my code on your machine). I use Adam as the optimizer. I built the same model, with the same weight initialization, in both PyTorch and Keras (TensorFlow backend), but PyTorch’s convergence is consistently slower than Keras’s. If you plot the loss over the epochs (a rough plotting helper is sketched after the numbers below), you will also see that PyTorch’s Adam is somewhat unstable at this learning rate while Keras is not, and that is not a negligible problem. These are the losses from a few trials after 200 epochs:
PyTorch:
3.9312e-04
9.4073e-04
4.9248e-04
3.9022e-04
Keras:
1.2597e-04
4.9654e-05
5.8871e-05
1.1851e-04
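(For reference, this is roughly how I compare the curves. It is only a sketch: it assumes the per-epoch losses from the two scripts below have been collected into Python lists, which the posted code does not do by itself.)

import matplotlib.pyplot as plt

def plot_losses(pytorch_losses, keras_losses):
    # Hypothetical helper: both arguments are lists of per-epoch MSE values
    plt.semilogy(pytorch_losses, label='PyTorch Adam')
    plt.semilogy(keras_losses, label='Keras Adam')
    plt.xlabel('epoch')
    plt.ylabel('MSE (log scale)')
    plt.legend()
    plt.show()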
PyTorch code:
import torch
import torch.nn as nn
from torch.autograd import Variable
import numpy as np
import torch.backends.cudnn
torch.backends.cudnn.enabled = False

BATCH_SIZE = 1
INPUT_DIM = 1
OUTPUT_DIM = 1
DTYPE = np.float64


class Net(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim, hidden_layers):
        super(Net, self).__init__()
        self.input_dim = input_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.hidden_layers = hidden_layers
        self.rnn = nn.RNN(input_dim, hidden_dim, hidden_layers)
        self.h2o = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # x has shape (seq_len, batch, input_dim); the initial hidden state is zeros
        h_0 = Variable(torch.zeros(self.hidden_layers, BATCH_SIZE, self.hidden_dim))
        if DTYPE == np.float32:
            h_0 = h_0.float()
        else:
            h_0 = h_0.double()
        output, h_t = self.rnn(x, h_0)
        output = self.h2o(output)
        return output


def weights_init(m):
    # Match the Keras SimpleRNN defaults: Glorot/Xavier uniform for the input weights,
    # orthogonal for the recurrent weights, zeros for the biases
    if isinstance(m, nn.RNN):
        nn.init.xavier_uniform(m.weight_ih_l0.data)
        nn.init.orthogonal(m.weight_hh_l0.data)
        nn.init.constant(m.bias_ih_l0.data, 0)
        nn.init.constant(m.bias_hh_l0.data, 0)
    if isinstance(m, nn.Linear):
        nn.init.xavier_uniform(m.weight.data)
        nn.init.constant(m.bias.data, 0)


data = np.loadtxt('data/mg17.csv', delimiter=',', dtype=DTYPE)
trX = torch.from_numpy(np.expand_dims(data[:4000, [0]], axis=1))  # shape (4000, 1, 1)
trY = torch.from_numpy(np.expand_dims(data[:4000, [1]], axis=1))

loss_fcn = nn.MSELoss()
model = Net(INPUT_DIM, 10, OUTPUT_DIM, 1)
if DTYPE == np.float32:
    model = model.float()
else:
    model = model.double()
model.apply(weights_init)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, betas=(0.9, 0.999), eps=2e-16, weight_decay=0)

for e in range(500):
    model.train()
    x = Variable(trX)
    y = Variable(trY)
    model.zero_grad()
    output = model(x)
    loss = loss_fcn(output, y)
    loss.backward()
    optimizer.step()
    print("Epoch", e + 1, "TR:", loss.cpu().data.numpy()[0])
For Keras, open ~/.keras/keras.json and set floatx to float64 and epsilon to 2e-16, so both frameworks use the same precision and the same epsilon.
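A quick way to confirm that Keras actually picked up those settings (a sketch, not part of the training script):

from keras import backend as K
print(K.floatx())   # should print 'float64'
print(K.epsilon())  # should print 2e-16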
Keras code:
import keras
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, SimpleRNN
DTYPE = np.float64
data = np.loadtxt('data/mg17.csv', delimiter=',', dtype=DTYPE)
X_data = np.expand_dims(data[:, [0]], axis=0)
Y_data = np.expand_dims(data[:, [1]], axis=0)
model = Sequential()
model.add(SimpleRNN(10, return_sequences=True, input_shape=(4000, 1)))
model.add(Dense(1, activation='linear'))
optimizer = keras.optimizers.Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=2e-16, decay=0)
model.compile(loss='mean_squared_error', optimizer=optimizer)
model.fit(X_data[:, :4000, :], Y_data[:, :4000, :], batch_size=1, epochs=500, verbose=2, shuffle=False)
UPDATE: The situation is the same with RMSprop. However, the issue does not appear with SGD, nor when I replace the simple RNN with an LSTM/GRU.
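For clarity, these are the only substitutions behind that update (a sketch; model, input_dim, etc. refer to the PyTorch code above, and the learning rate is the same 0.01):

# Optimizer swaps, same learning rate as above:
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.01)  # gap vs. Keras remains
# optimizer = torch.optim.SGD(model.parameters(), lr=0.01)    # gap disappears

# Cell swaps inside Net.__init__ (gap disappears with either):
# self.rnn = nn.LSTM(input_dim, hidden_dim, hidden_layers)
# self.rnn = nn.GRU(input_dim, hidden_dim, hidden_layers)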