I’ve seen examples where users initialize a hidden state for an LSTM network. I was under the impression that hidden states were managed under the hood. What’s the advantage of manually initializing the hidden state?
import torch
import torch.nn as nn

train_on_gpu = torch.cuda.is_available()  # defined outside the class in the original snippet

class CharLSTM(nn.Module):
    def __init__(self, tokens, n_hidden=256, n_layers=2, drop_prob=0.5, lr=0.001):
        super().__init__()
        self.drop_prob = drop_prob
        self.n_layers = n_layers
        self.n_hidden = n_hidden
        self.lr = lr

        # Character vocabulary and integer lookup tables
        self.chars = tokens
        self.int2char = dict(enumerate(self.chars))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}

        self.lstm = nn.LSTM(len(self.chars), n_hidden, n_layers,
                            dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(n_hidden, len(self.chars))

    def forward(self, x, hidden):
        ''' Forward pass through the network.
            The inputs are x and the hidden/cell state tuple `hidden`. '''
        r_output, hidden = self.lstm(x, hidden)
        out = self.dropout(r_output)
        # Flatten to (batch * seq_len, n_hidden) for the linear layer
        out = out.contiguous().view(-1, self.n_hidden)
        out = self.fc(out)
        return out, hidden

    def init_hidden(self, batch_size):
        ''' Initializes the hidden and cell states to zeros,
            each shaped (n_layers, batch_size, n_hidden). '''
        weight = next(self.parameters()).data

        if train_on_gpu:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.n_hidden).zero_(),
                      weight.new(self.n_layers, batch_size, self.n_hidden).zero_())

        return hidden
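To make concrete what I mean by "managed under the hood": my understanding is that nn.LSTM zero-initializes both states itself when no hidden argument is passed, which looks equivalent to what init_hidden builds explicitly. Here's a minimal sketch of that comparison (the sizes are made up for illustration, not taken from the class above):

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=83, hidden_size=256, num_layers=2, batch_first=True)
x = torch.randn(4, 10, 83)  # (batch, seq_len, input_size)

# Implicit: hidden and cell states default to zeros when hx is not provided
out_a, (h_a, c_a) = lstm(x)

# Explicit: zero tensors shaped (num_layers, batch, hidden_size)
h0 = torch.zeros(2, 4, 256)
c0 = torch.zeros(2, 4, 256)
out_b, (h_b, c_b) = lstm(x, (h0, c0))

print(torch.equal(out_a, out_b))  # True

If that's right, the outputs are identical either way, which makes me think the explicit init_hidden only matters when you want to hold onto (or detach) the state between batches rather than just create it.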