How to make LSTM's initial hidden state learnable during training?

Hi all,

Please find the current implementation of the LSTM classifier I am using below:

class LSTMClassifier(nn.Module):
    """LSTM classifier that labels one sentence at a time.

    The model embeds the tokens, runs them through a single ``nn.LSTM`` with
    a batch size of 1, and maps the LSTM output at the last time step to
    log-probabilities over ``label_size`` labels.

    The recurrent state is stored on the instance as ``self.hidden`` and is
    overwritten by every ``forward`` call, so callers that want each input to
    start from zeros must assign ``model.hidden = model.init_hidden()``
    before calling the model again.
    """

    def __init__(self, embedding_dim, hidden_dim, vocab_size, label_size):
        """Build the embedding, LSTM and output layers.

        Args:
            embedding_dim: size of each token embedding (LSTM input size).
            hidden_dim: size of the LSTM hidden and cell states.
            vocab_size: number of rows in the embedding table.
            label_size: number of output labels.
        """
        super(LSTMClassifier, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)
        self.hidden2label = nn.Linear(hidden_dim, label_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh zero-valued ``(h, c)`` pair.

        Each tensor has shape ``(1, 1, hidden_dim)``. The first element is
        the hidden state ``h``, the second is the cell state ``c``.
        """
        return (autograd.Variable(torch.zeros(1, 1, self.hidden_dim)),
                autograd.Variable(torch.zeros(1, 1, self.hidden_dim)))

    def forward(self, sentence):
        """Return log-probabilities of shape ``(1, label_size)``.

        Args:
            sentence: token indices for one sentence; presumably a 1-D
                integer tensor -- ``len(sentence)`` is used as the sequence
                length.
        """
        embeds = self.word_embeddings(sentence)
        # Reshape to (seq_len, batch=1, embedding_dim) as expected by nn.LSTM.
        x = embeds.view(len(sentence), 1, -1)
        # The returned (h, c) replaces the stored state, so it carries over
        # to the next call unless the caller resets it.
        lstm_out, self.hidden = self.lstm(x, self.hidden)
        # lstm_out[-1] is the output at the last time step: (1, hidden_dim).
        y = self.hidden2label(lstm_out[-1])
        # y is (1, label_size); normalise over the label dimension explicitly
        # instead of relying on the deprecated implicit-dim behaviour.
        log_probs = F.log_softmax(y, dim=1)
        return log_probs

While training the network, I detach the hidden state from its history (from the previous batch) by re-initializing it before each batch:

def train(..., ...):
    for batch in batches:
        # Start each batch from the fresh zero-valued (h, c) pair returned by
        # init_hidden(), discarding the state left by the previous batch.
        model.hidden = model.init_hidden()

However, I want to learn the params in the initial hidden state. What’s the proper way to make the initial hidden state learnable during training?

In the __init__() function, try defining the hidden and cell states as nn.Parameter objects:

# Initial hidden state h_0, zero-initialised with shape (1, 1, hidden_dim);
# wrapping it in nn.Parameter registers it with the module as a trainable
# parameter.
self.hidden = nn.Parameter(torch.zeros(1, 1, self.hidden_dim))
# Initial cell state c_0, same shape, registered the same way.
self.cell_state = nn.Parameter(torch.zeros(1, 1, self.hidden_dim))

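There is then no need to call init_hidden() for every batch, since the initial states are learned. In forward(), pass them to the LSTM as the initial state, e.g. `lstm_out, _ = self.lstm(x, (self.hidden, self.cell_state))`, and do not assign the state returned by the LSTM back to `self.hidden`: that attribute is now a registered parameter, and assigning the returned tuple to it raises a TypeError.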