LSTM results having no variance compared to IRNN

I am trying to replicate the paper [[1504.00941] A Simple Way to Initialize Recurrent Networks of Rectified Linear Units](https://arxiv.org/abs/1504.00941), specifically the addition problem with T = 150, but I am finding that the LSTM seems to jump from the baseline straight to near-perfect performance with almost no noise in between (Figure_1). I think there might be a problem with how I implemented the LSTM unit, but I cannot seem to find it.

For both architectures I am using 25 hidden units, with a learning rate of 0.1 for the LSTM and 0.005 for the IRNN. The remaining hyperparameters are left at the defaults set by the argparser in main.
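Concretely, the two runs were launched roughly like this (all other flags are the argparse defaults from main, shown further down):

python main.py --model LSTM --n_hidden 25 --lr 0.1 --T 150
python main.py --model IRNN --n_hidden 25 --lr 0.005 --T 150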

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(LSTM, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.output_weights = nn.Linear(hidden_size, output_size)
        self.init_bias()

    def forward(self, inp):
        # self.rnn returns (output, (h_n, c_n)); h_n has shape
        # (num_layers, batch, hidden_size), so hnn[0][0] is the last
        # hidden state of the single layer for the whole batch
        _, hnn = self.rnn(inp)
        out = self.output_weights(hnn[0][0])
        return out

    def init_bias(self, value=1):
        # Initialize the forget-gate bias to `value`.
        # nn.LSTM exposes two bias vectors per layer (bias_ih_l0 and
        # bias_hh_l0), each of size 4 * hidden_size in gate order
        # (input, forget, cell, output), so the slice [n//4 : n//2] is the
        # forget gate and each of the two vectors gets value / 2.
        for names in self.rnn._all_weights:
            for name in filter(lambda n: "bias" in n, names):
                bias = getattr(self.rnn, name)
                n = bias.size(0)
                start, end = n // 4, n // 2
                bias.data[start:end].fill_(value / 2)


# Model taken from  arXiv:1504.00941v2
class IRNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(IRNN, self).__init__()
        self.hidden_size = hidden_size
        self.rnn = nn.RNN(input_size, hidden_size,
                          nonlinearity='relu', batch_first=True, bias=True)
        self.output_weights = nn.Linear(hidden_size, output_size)

        # Parameter initialization from arXiv:1504.00941: identity recurrent
        # matrix, zero biases, small random input-to-hidden weights
        self.rnn.weight_hh_l0.data.copy_(th.eye(hidden_size))
        self.rnn.bias_ih_l0.data.fill_(0)
        self.rnn.bias_hh_l0.data.fill_(0)
        self.rnn.weight_ih_l0.data.copy_(
            th.randn(hidden_size, input_size) / hidden_size)

    def forward(self, inp):
        # nn.RNN returns (output, h_n); h_n has shape
        # (num_layers, batch, hidden_size), so hnn[0] is the final hidden
        # state of the single layer for the whole batch
        _, hnn = self.rnn(inp)
        out = self.output_weights(hnn[0])
        return out

Maybe PyTorch does something in the nn.LSTM class that I am not aware of.
I think the rest is fairly straightforward and is the same for both architectures.
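As a quick sanity check on the parameter layout (relying on PyTorch's documented gate order i, f, g, o), something along these lines at least confirms the 4 * hidden_size bias layout that init_bias relies on:

import torch.nn as nn

lstm = nn.LSTM(2, 25, batch_first=True)
for name, p in lstm.named_parameters():
    print(name, tuple(p.size()))
# weight_ih_l0 (100, 2), weight_hh_l0 (100, 25),
# bias_ih_l0 (100,), bias_hh_l0 (100,):
# both bias vectors have 4 * hidden_size entries, so with gate order
# (input, forget, cell, output) the slice [25:50] is the forget gate.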

My main is:

import matplotlib.pyplot as plt
import torch as th
import torch.nn as nn
from benchmark_problems import Addition_Dataset
from models import *
from train import train
import argparse
import numpy as np

parser = argparse.ArgumentParser(
    description='PyTorch Addition problem with LSTM')
parser.add_argument('--n_hidden', type=int, default=100,
                    help='number of hidden units')
parser.add_argument('--clip', type=int, default=10,
                    help='gradient clipping')
parser.add_argument('--lr', type=float, default=.01,
                    help='learning rate')
parser.add_argument('--n_epochs', type=int, default=200,
                    help='number of epochs')
parser.add_argument('--cuda', action='store_true',
                    help='use CUDA')
parser.add_argument('--N_test', type=int, default=1000,
                    help='number of test datapoints')
parser.add_argument('--N_train', type=int, default=100000,
                    help='number of training datapoints')
parser.add_argument('--T', type=int, default=150,
                    help='sequence length')
parser.add_argument('--seed', type=int, default=100,
                    help='random seed')
parser.add_argument('--batch_test', type=int, default=1000,
                    help='mini batch length for test set')
parser.add_argument('--batch_train', type=int, default=16,
                    help='mini batch length for train set')
parser.add_argument('--verbose', action='store_true',
                    help='print loss every epoch')
parser.add_argument('--model', type=str, default='LSTM',
                    help='choose between LSTM, np_RNN, IRNN')
parser.add_argument('--path', type=str, default='model_state/state_epoch',
                    help='path where to save model')
parser.add_argument('--load_path', type=str, default='',
                    help='path where to load the model')
parser.add_argument('--save_fig', type=str, default='',
                    help='path where to save the figure')

args = parser.parse_args()


th.manual_seed(args.seed)
np.random.seed(args.seed)

if th.cuda.is_available():
    if not args.cuda:
        print("WARNING: You have a CUDA device, so you should probably run with --cuda")
    else:
        th.cuda.manual_seed(args.seed)
        th.backends.cudnn.benchmark = True


# Prepare the data
dtype = th.FloatTensor
add_problem = Addition_Dataset(args.T, dtype)
train_loader = add_problem.get_loader(args.N_train, args.batch_train)
test_loader = add_problem.get_loader(args.N_test, args.batch_test)

# Initialize the model
# LSTM taken from models.py
if args.model == 'LSTM':
    model = LSTM(2, args.n_hidden, 1)
elif args.model == 'np_RNN':
    model = np_RNN(2, args.n_hidden, 1)
elif args.model == 'IRNN':
    model = IRNN(2, args.n_hidden, 1)

if args.cuda:
    model.cuda()
if args.load_path:
    model.load_state_dict(th.load(args.load_path))


criterion = nn.MSELoss()

optimizer = th.optim.SGD(model.parameters(), lr=args.lr)
model, losses = train(model, criterion, optimizer, train_loader, test_loader,
                      clip=args.clip, n_epochs=args.n_epochs, use_cuda=args.cuda, verbose=args.verbose, PATH=args.path)
np.save(args.save_fig + 'losses.npy', losses)
plt.plot(losses)
plt.ylim([0, 1])
plt.ylabel('MSE')
plt.xlabel('Epoch')
plt.savefig(args.save_fig + 'losses.png')
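For reference, Addition_Dataset (imported from benchmark_problems, not shown here) generates data along these lines; this is a simplified sketch of the task as described in the paper rather than my exact implementation, and the way the two marker positions are sampled may differ slightly:

import torch as th
from torch.utils.data import TensorDataset, DataLoader

def make_addition_data(N, T):
    # channel 0: values uniformly sampled in [0, 1]
    # channel 1: a 0/1 marker with exactly two entries set to 1
    # target:    the sum of the two marked values
    values = th.rand(N, T)
    markers = th.zeros(N, T)
    for n in range(N):
        idx = th.randperm(T)[:2]
        markers[n, idx] = 1.0
    x = th.stack([values, markers], dim=2)           # (N, T, 2), batch_first
    y = (values * markers).sum(dim=1, keepdim=True)  # (N, 1)
    return TensorDataset(x, y)

# get_loader then wraps this in a DataLoader, roughly:
# DataLoader(make_addition_data(N, T), batch_size=batch_size, shuffle=True)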

and my training procedure is:

train.py

import torch as th
import tqdm
from torch.autograd import Variable as V
import numpy as np


def train(model, criterion, optimizer, train_loader, test_loader, clip=10, n_epochs=1, use_cuda=False, verbose=True, mnist=False, PATH='model_state/state_epoch'):

    losses = np.zeros(n_epochs)
    i = 0
    if mnist:
        correct = np.zeros(n_epochs)
    for epoch in tqdm.tqdm(range(n_epochs)):
        for x_batch, y_batch in train_loader:
            if use_cuda:
                x_batch = V(x_batch).cuda()
                y_batch = V(y_batch).cuda()
            else:
                x_batch = V(x_batch)
                y_batch = V(y_batch)
            optimizer.zero_grad()
            output = model.forward(x_batch)
            loss = criterion(output, y_batch)
            loss.backward()
            th.nn.utils.clip_grad_norm(model.parameters(), clip)
            optimizer.step()

        loss_test = 0
        for x_batch, y_batch in test_loader:
            if use_cuda:
                x_batch = V(x_batch).cuda()
                y_batch = V(y_batch).cuda()
            else:
                x_batch = V(x_batch)
                y_batch = V(y_batch)
            output = model.forward(x_batch)
            loss_test += criterion(output, y_batch)
            if mnist:
                correct[i] += (th.max(output.data, 1)[1] == y_batch.data).sum()
        losses[i] = loss_test / len(test_loader)
        if verbose:
            print('Loss at epoch', epoch + 1, ':', losses[i])
            if mnist:
                print('Accuracy at epoch', epoch + 1, ':', correct[i])
        i += 1
        th.save(model.state_dict(), PATH + str(i) + '.pt')
    if not mnist:
        return model, losses
    else:
        return model, losses, correct / len(test_loader) / test_loader.batch_size
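Finally, in case it matters, this is roughly how the two saved loss curves can be overlaid to get a plot like Figure_1 (the file names here are placeholders for whatever args.save_fig + 'losses.npy' ends up being):

import numpy as np
import matplotlib.pyplot as plt

lstm_losses = np.load('lstm_losses.npy')  # placeholder path
irnn_losses = np.load('irnn_losses.npy')  # placeholder path

plt.plot(lstm_losses, label='LSTM (lr=0.1)')
plt.plot(irnn_losses, label='IRNN (lr=0.005)')
plt.ylim([0, 1])
plt.ylabel('MSE')
plt.xlabel('Epoch')
plt.legend()
plt.show()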