My custom LSTM cannot learn properly

Hello,

I’ve been trying to write a custom LSTM in PyTorch so that I can make further custom modifications to it. I used lstms.pth/lstm.py at master · seba-1511/lstms.pth · GitHub as a reference for how to lay out the custom LSTM class. But when I run it on the toy “airline passenger prediction” dataset, the loss seems to be stuck!
Any comments would be very helpful!
Thanks in advance!
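
For context, here is roughly how the passenger series is turned into supervised samples (a sketch: the file name, the ‘Passengers’ column, and the look-back window of 4, chosen to match input_size below, are placeholders, so my actual preprocessing may differ slightly).

import numpy as np
import pandas as pd
import torch
from sklearn.preprocessing import MinMaxScaler

df = pd.read_csv('airline-passengers.csv')               # assumed file name
series = df['Passengers'].values.astype('float32').reshape(-1, 1)

scaler = MinMaxScaler()                                   # scale to [0, 1]
series = scaler.fit_transform(series)

look_back = 4                                             # matches input_size
X, y = [], []
for i in range(len(series) - look_back):
    X.append(series[i:i + look_back, 0])                  # window of 4 values
    y.append(series[i + look_back, 0])                    # next value as target

X = torch.tensor(np.array(X), dtype=torch.float32)        # shape (N, 4)
y = torch.tensor(np.array(y), dtype=torch.float32).view(-1, 1)   # shape (N, 1)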

import math

import torch            # used by the training code below
import torch as th      # the model code uses the `th` alias
import torch.nn as nn
import torch.nn.functional as F
from torch.nn import Parameter


class SlowLSTM(nn.Module):
    """
    A pedagogic implementation of Hochreiter & Schmidhuber:
    'Long Short-Term Memory'
    http://www.bioinf.jku.at/publications/older/2604.pdf
    """

    def __init__(self, input_size: int, hidden_size: int, bias: bool = True, dropout: float = 0.0):
        super(SlowLSTM, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.bias = bias
        self.dropout = dropout
        # input to hidden weights
        self.w_xi = Parameter(th.empty(hidden_size, input_size))
        self.w_xf = Parameter(th.empty(hidden_size, input_size))
        self.w_xo = Parameter(th.empty(hidden_size, input_size))
        self.w_xc = Parameter(th.empty(hidden_size, input_size))
        # hidden to hidden weights
        self.w_hi = Parameter(th.empty(hidden_size, hidden_size))
        self.w_hf = Parameter(th.empty(hidden_size, hidden_size))
        self.w_ho = Parameter(th.empty(hidden_size, hidden_size))
        self.w_hc = Parameter(th.empty(hidden_size, hidden_size))
        # bias terms (zero-initialised)
        b_i = th.zeros(hidden_size)
        b_f = th.zeros(hidden_size)
        b_o = th.zeros(hidden_size)
        b_c = th.zeros(hidden_size)

        # Wrap biases as parameters if desired, else register them as
        # buffers so they move with the module but receive no gradients
        if bias:
            self.b_i = Parameter(b_i)
            self.b_f = Parameter(b_f)
            self.b_o = Parameter(b_o)
            self.b_c = Parameter(b_c)
        else:
            self.register_buffer('b_i', b_i)
            self.register_buffer('b_f', b_f)
            self.register_buffer('b_o', b_o)
            self.register_buffer('b_c', b_c)
        self.reset_parameters()

        # read-out layer; created after reset_parameters() so it keeps
        # the default nn.Linear initialisation
        self.fc = nn.Linear(self.hidden_size, 1)


    def reset_parameters(self):
        std = 1.0 / math.sqrt(self.hidden_size)
        for w in self.parameters():
            w.data.uniform_(-std, std)

    def forward(self, x):
        # fresh zero hidden and cell states on every call (no state is
        # carried across calls)
        h = th.zeros(1, self.hidden_size, device=x.device)
        c = th.zeros(1, self.hidden_size, device=x.device)

        x = x.view(x.size(0), -1)
        # Linear mappings
        i_t = th.mm(x, self.w_xi.t()) + th.mm(h, self.w_hi) + self.b_i
        f_t = th.mm(x, self.w_xf.t()) + th.mm(h, self.w_hf) + self.b_f
        o_t = th.mm(x, self.w_xo.t()) + th.mm(h, self.w_ho) + self.b_o
        # gate activations
        i_t = th.sigmoid(i_t)
        f_t = th.sigmoid(f_t)
        o_t = th.sigmoid(o_t)
        # cell computations
        c_t = th.mm(x, self.w_xc.t()) + th.mm(h, self.w_hc) + self.b_c
        c_t = th.tanh(c_t)
        c_t = th.mul(c, f_t) + th.mul(i_t, c_t)
        h_t = th.mul(o_t, th.tanh(c_t))
        # Reshape for compatibility
        h_t = h_t.view(h_t.size(0), 1, -1)
        c_t = c_t.view(c_t.size(0), 1, -1)
        if self.dropout > 0.0:
            h_t = F.dropout(h_t, p=self.dropout, training=self.training)

        out = self.fc(h_t)

        return out
        # return h_t, (h_t, c_t)

    def sample_mask(self):
        pass
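
As a quick sanity check on shapes (names like `model` and `dummy` are just for illustration), a forward pass on a batch of windows looks like this; note that the output comes back with an extra middle dimension:

model = SlowLSTM(input_size=4, hidden_size=3)
dummy = th.randn(8, 4)        # batch of 8 windows, 4 values each
out = model(dummy)
print(out.shape)              # torch.Size([8, 1, 1])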

## Training

num_epochs = 20000
learning_rate = 1e-3  # 0.01

input_size = 4
hidden_size = 3  # 2
num_layers = 1

num_classes = 1

lstm = SlowLSTM(input_size, hidden_size)

criterion = torch.nn.MSELoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)
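
For completeness, the training loop is essentially the usual full-batch loop below (a sketch: X and y are the windowed tensors from the preprocessing sketch above, so those names are assumptions):

for epoch in range(num_epochs):
    lstm.train()
    outputs = lstm(X)                          # shape (N, 1, 1)
    loss = criterion(outputs.view(-1, 1), y)   # flatten so it matches y's (N, 1)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    if epoch % 1000 == 0:
        print(f'epoch {epoch}, loss {loss.item():.6f}')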