Reproducing LSTM implementation : Dropout

I’m trying to reproduce the LSTM implementation of Pytorch by implementing my own module to understand it better.

This is my code so far :

import math
import torch
from torch import nn

class MyLSTM(nn.Module):
    def __init__(self, inp_dim, hidden_dim, n_layers=1, dropout=0.0, batch_first=False):
        super().__init__()

        self.inp_dim = inp_dim
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.batch_first = batch_first

        self.layers = nn.ModuleList([
            nn.LSTMCell(inp_dim, hidden_dim) if i == 0 else nn.LSTMCell(hidden_dim, hidden_dim)
            for i in range(n_layers)
        ])
        self.dropout = nn.Dropout(dropout) if dropout > 0 else nn.Identity()

    def forward(self, x, hidden=None):
        batch_dim = 0 if self.batch_first else 1
        seq_dim = 1 if self.batch_first else 0

        if hidden is None:
            batch_size = x.size(batch_dim)
            hx, cx = torch.zeros(self.n_layers, batch_size, self.hidden_dim, dtype=x.dtype, device=x.device), torch.zeros(self.n_layers, batch_size, self.hidden_dim, dtype=x.dtype, device=x.device)
        else:
            hx, cx = hidden

        outputs = []
        for ts in range(x.size(seq_dim)):
            x_ts = x[:, ts] if self.batch_first else x[ts]
            hx, cx = self.forward_timestep(x_ts, hx, cx)
            outputs.append(hx[-1])

        return torch.stack(outputs, dim=seq_dim), (hx, cx)

    def forward_timestep(self, x, hx, cx):
        hx = hx.chunk(self.n_layers, 0)
        cx = cx.chunk(self.n_layers, 0)
        prev_output = x
        states = []
        outputs = []
        for i, layer in enumerate(self.layers):
            output, state = layer(prev_output, (hx[i].squeeze(0), cx[i].squeeze(0)))
            if i < self.n_layers - 1:
                output = self.dropout(output)
            outputs.append(output)
            states.append(state)
            prev_output = output

        return torch.stack(outputs), torch.stack(states)

I test the equivalency with this code :

    torch.manual_seed(42)
    b = nn.LSTM(5, 4, num_layers=3, dropout=0.0)

    torch.manual_seed(42)
    m = MyLSTM(5, 4, n_layers=3, dropout=0.0)

    assert torch.equal(b.weight_ih_l0, m.layers[0].w_ih), f"Weights are different for [0]w_ih !\nStandard model :\n{b.weight_ih_l0}\n\nCustom model :\n{m.layers[0].w_ih}"
    assert torch.equal(b.weight_hh_l0, m.layers[0].w_hh), f"Weights are different for [0]w_hh !\nStandard model :\n{b.weight_hh_l0}\n\nCustom model :\n{m.layers[0].w_hh}"
    assert torch.equal(b.bias_ih_l0, m.layers[0].b_ih), f"Bias are different for [0]b_ih !\nStandard model :\n{b.bias_ih_l0}\n\nCustom model :\n{m.layers[0].b_ih}"
    assert torch.equal(b.bias_hh_l0, m.layers[0].b_hh), f"Bias are different for [0]b_hh !\nStandard model :\n{b.bias_hh_l0}\n\nCustom model :\n{m.layers[0].b_hh}"
    assert torch.equal(b.weight_ih_l1, m.layers[1].w_ih), f"Weights are different for [1]w_ih !\nStandard model :\n{b.weight_ih_l1}\n\nCustom model :\n{m.layers[1].w_ih}"
    assert torch.equal(b.weight_hh_l1, m.layers[1].w_hh), f"Weights are different for [1]w_hh !\nStandard model :\n{b.weight_hh_l1}\n\nCustom model :\n{m.layers[1].w_hh}"
    assert torch.equal(b.bias_ih_l1, m.layers[1].b_ih), f"Bias are different for [1]b_ih !\nStandard model :\n{b.bias_ih_l1}\n\nCustom model :\n{m.layers[1].b_ih}"
    assert torch.equal(b.bias_hh_l1, m.layers[1].b_hh), f"Bias are different for [1]b_hh !\nStandard model :\n{b.bias_hh_l1}\n\nCustom model :\n{m.layers[1].b_hh}"
    assert torch.equal(b.weight_ih_l2, m.layers[2].w_ih), f"Weights are different for [2]w_ih !\nStandard model :\n{b.weight_ih_l2}\n\nCustom model :\n{m.layers[2].w_ih}"
    assert torch.equal(b.weight_hh_l2, m.layers[2].w_hh), f"Weights are different for [2]w_hh !\nStandard model :\n{b.weight_hh_l2}\n\nCustom model :\n{m.layers[2].w_hh}"
    assert torch.equal(b.bias_ih_l2, m.layers[2].b_ih), f"Bias are different for [2]b_ih !\nStandard model :\n{b.bias_ih_l2}\n\nCustom model :\n{m.layers[2].b_ih}"
    assert torch.equal(b.bias_hh_l2, m.layers[2].b_hh), f"Bias are different for [2]b_hh !\nStandard model :\n{b.bias_hh_l2}\n\nCustom model :\n{m.layers[2].b_hh}"

    b_n_params = sum(p.numel() for p in b.parameters() if p.requires_grad)
    m_n_params = sum(p.numel() for p in m.parameters() if p.requires_grad)
    assert b_n_params == m_n_params, f"They don't have the same parameters : {b_n_params} vs {m_n_params}"
    print(f"LSTM has {b_n_params} parameters")

    x = torch.rand(6, 8, 5)
    torch.manual_seed(42)
    b_y1, b_h1 = b(x)
    torch.manual_seed(42)
    m_y1, m_h1 = m(x)
    assert torch.allclose(b_y1, m_y1), f"Tensors are different for y1 !\nStandard model :\n{b_y1}\n\nCustom model :\n{m_y1}"
    b_h, b_c = b_h1
    m_h, m_c = m_h1
    assert torch.allclose(b_h, m_h), f"Tensors are different for h1_h !\nStandard model :\n{b_h}\n\nCustom model :\n{m_h}"
    assert torch.allclose(b_c, m_c), f"Tensors are different for h1_c !\nStandard model :\n{b_c}\n\nCustom model :\n{m_c}"

    x = torch.rand(3, 8, 5)
    torch.manual_seed(42)
    b_y2, b_h2 = b(x, b_h1)
    torch.manual_seed(42)
    m_y2, m_h2 = m(x, m_h1)
    assert torch.allclose(b_y2, m_y2), f"Tensors are different for y1 !\nStandard model :\n{b_y2}\n\nCustom model :\n{m_y2}"
    b_h, b_c = b_h2
    m_h, m_c = m_h2
    assert torch.allclose(b_h, m_h, rtol=1e-04), f"Tensors are different for h2_h !\nStandard model :\n{b_h}\n\nCustom model :\n{m_h}"
    assert torch.allclose(b_c, m_c, rtol=1e-04), f"Tensors are different for h2_c !\nStandard model :\n{b_c}\n\nCustom model :\n{m_c}"

It works great when dropout is 0 (both implementations return the same tensors given the same inputs and same weights).
But if I change the dropout to something else, my implementation gives different results…

I was careful to not apply dropout at the last LSTM layer. So what am I doing wrong ??

I assume that everything works if you set b.eval() and m.eval() which turns of Dropout. Maybe the docs help.

I’m trying something similar: I want to check of 2 implementations of the same model – i.e., different syntax – yield the same output for the same input. If I set the seed for calling forward for each model and the same input, I indeed see the same output. However, I don’t really compare with a re-implementation of the while LSTM layer :).

1 Like

Yes, turning off dropout (either with a dropout rate set to 0 or setting the model to eval) make the code output the right values (my implementation leading to the same output as the standard implementation).

If I set the seed for calling forward for each model and the same input, I indeed see the same output.

That’s what I’m doing as well, but yet the results are different !
Hence my question. My guess is that I’m using the Dropout wrongly, but I don’t know why…