Reproducing classical LSTM results

I’m trying to reproduce a classical result of Hochreiter and Schmidhuber, namely the addition problem (https://papers.nips.cc/paper/1215-lstm-can-solve-hard-long-time-lag-problems.pdf), which is cited in many modern papers on training RNNs/LSTMs. My results just don’t match what’s described in the paper. They claim that for sequences with T=100, training stops after processing 74,000 sequences, which requires the average training error to be below 0.01 and all of the last 2,000 processed sequences to be classified correctly (error below 0.04). However, after processing 100,000 sequences, my average training error converges to slightly above 0.04 and I never classify more than 1,400 of the last 2,000 sequences correctly (see attached plots). My code is

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

import matplotlib.pyplot as plt
import numpy as np

# Generate one random sequence for the addition problem
def gen_sequence(T):
    # sequence length is drawn from [T, T + T/10] (inclusive)
    seq_length = int(torch.randint(T, T + int(T/10) + 1, (1,)).item())

    # first channel: random values in [-1, 1); second channel: markers
    x = torch.zeros(seq_length, 2)
    x[:, 0].uniform_(-1.0, 1.0)
    x[0, 1] = -1.0
    x[seq_length - 1, 1] = -1.0

    # the first mark is placed among the first 10 positions
    first_mark = int(torch.randint(0, 10, (1,)).item())
    x[first_mark, 1] = 1.0

    # if the first mark lands on position 0, its value is zeroed
    if first_mark == 0:
        x[0, 0] = 0.0

    # the second mark is placed among the first T/2 positions, distinct from the first
    second_mark = int(torch.randint(0, int(T/2), (1,)).item())
    while second_mark == first_mark:
        second_mark = int(torch.randint(0, int(T/2), (1,)).item())

    x[second_mark, 1] = 1.0

    # target: 0.5 plus a quarter of the sum of the two marked values
    return x.view(1, seq_length, 2), 0.5 + (x[first_mark, 0] + x[second_mark, 0]) / 4.0

# LSTM architecture
class MyLSTM(nn.Module):
    def __init__(self):
        super(MyLSTM, self).__init__()

        # 3 stacked LSTM layers with hidden size 2, followed by a linear readout
        self.lstm = nn.LSTM(input_size=2, hidden_size=2, num_layers=3, batch_first=True)
        self.fc = nn.Linear(2, 1)

    def forward(self, x):
        # keep only the last time step of the single batch element
        x = self.lstm(x)[0][0, -1, :]

        # squash the readout into (0, 1), matching the range of the targets
        return torch.sigmoid(self.fc(x))

        
T = 100
ep = 100000

model = MyLSTM().cuda()
optimizer = optim.SGD(model.parameters(), lr=0.5)

running_mean = 0.0
means = np.zeros((ep,))
classifications = np.zeros((ep,))

classified = []

for j in range(ep):
    # train online: one freshly generated sequence per update
    x, y = gen_sequence(T)
    x, y = x.cuda(), y.cuda()

    optimizer.zero_grad()

    # squared error between the network output and the target sum
    loss = torch.pow(model(x) - y, 2)
    loss.backward()

    my_loss = loss.item()

    optimizer.step()

    # incremental update of the mean training error over all processed sequences
    running_mean = (j*running_mean + my_loss)/(j+1)
    means[j] = running_mean

    # sliding window over the last 2,000 sequences: a sequence counts as
    # correctly classified if its error is below 0.04
    if len(classified) == 2000:
        classified.pop(0)

    if my_loss < 0.04:
        classified.append(1)
    else:
        classified.append(0)

    classifications[j] = sum(classified)

t = np.arange(1, ep+1)

plt.figure(1)
plt.plot(t, means)
plt.xlabel('Processed Sequences')
plt.ylabel('Running Mean of Training Error')

plt.figure(2)
plt.plot(t, classifications)
plt.xlabel('Processed Sequences')
plt.ylabel('Correctly Classified Sequences (from last 2000)')

plt.show()
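
To check the data generation, I inspect a few sequences like this, using gen_sequence and T from the script above (the assertions only encode my own reading of the task, not anything taken from the paper):

# Quick sanity check on the generated data (my own reading of the task)
for _ in range(5):
    x, y = gen_sequence(T)
    seq = x[0]
    marks = (seq[:, 1] == 1.0).nonzero(as_tuple=True)[0]
    print('length:', seq.shape[0], 'marks:', marks.tolist(), 'target:', y.item())

    # the target should be 0.5 plus a quarter of the sum of the two marked values,
    # so it always lies in [0, 1]
    assert marks.numel() == 2
    assert torch.isclose(y, 0.5 + seq[marks, 0].sum() / 4.0)
    assert 0.0 <= y.item() <= 1.0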

I think I’m generating the sequences correctly. Is my LSTM architecture wrong? Maybe it’s the initialization, but they claim that shouldn’t matter much. Does anyone have a source where this is implemented, so I can at least check against it (it doesn’t have to be in PyTorch)? Thanks.
[Figure 1: running mean of the training error over processed sequences]
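
For reference, this is how I read the paper’s stopping criterion, in terms of the quantities already tracked in the training loop above (running_mean and the classified window). The helper name and the exact thresholds are just my interpretation of the text, so I may have them wrong:

# my reading of the stopping rule: mean error below 0.01 and all of the
# last 2,000 processed sequences classified correctly (error below 0.04)
def should_stop(running_mean, classified):
    return (running_mean < 0.01
            and len(classified) == 2000
            and sum(classified) == 2000)

Inside the loop I would break as soon as should_stop(running_mean, classified) returns True; in my runs that never happens, which matches the plots above.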