It's a good convention. In that example they create a custom model, and hidden_dim defines the output size they want from the LSTM. A bidirectional LSTM concatenates the forward and backward directions, so its output is twice as wide; if you want to keep the final output size the same, you have to divide hidden_dim by 2.
import torch
import torch.nn as nn
from torch.autograd import Variable  # on recent PyTorch you can pass plain tensors instead

hidden_size = 128

# Unidirectional: h_0/c_0 have shape (num_layers * num_directions, batch, hidden_size) = (1, 1, 128)
lstm = nn.LSTM(10, hidden_size, num_layers=1, batch_first=True, bidirectional=False)
hidden_vect_1 = (
    Variable(torch.zeros(1, 1, hidden_size)),
    Variable(torch.zeros(1, 1, hidden_size)))
output, hidden = lstm(Variable(torch.rand(1, 5, 10)), hidden_vect_1)
print('Output size:', output.size(), '- Hidden size:', [h.size() for h in hidden])
# Output size: torch.Size([1, 5, 128]) - Hidden size: [torch.Size([1, 1, 128]), torch.Size([1, 1, 128])]
# Bidirectional: h_0/c_0 now have shape (num_layers * num_directions, batch, hidden_size) = (2, 1, 128)
lstm = nn.LSTM(10, hidden_size, num_layers=1, batch_first=True, bidirectional=True)
hidden_vect_1 = (
    Variable(torch.zeros(2, 1, hidden_size)),
    Variable(torch.zeros(2, 1, hidden_size)))
output, hidden = lstm(Variable(torch.rand(1, 5, 10)), hidden_vect_1)
print('Output size:', output.size(), '- Hidden size:', [h.size() for h in hidden])
# Output size: torch.Size([1, 5, 256]) - Hidden size: [torch.Size([2, 1, 128]), torch.Size([2, 1, 128])]
# Note that the output is twice as wide (256 vs. 128), and the hidden states are [2, 1, 128] instead of [1, 1, 128]
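To tie this back to the divide-by-2 point, here is a minimal sketch using the same toy setup as above: passing hidden_size // 2 to a bidirectional LSTM brings the concatenated output back to the original width of 128.

# Halving the hidden size so the two directions concatenate back to 128
lstm = nn.LSTM(10, hidden_size // 2, num_layers=1, batch_first=True, bidirectional=True)
hidden_vect_1 = (
    Variable(torch.zeros(2, 1, hidden_size // 2)),
    Variable(torch.zeros(2, 1, hidden_size // 2)))
output, hidden = lstm(Variable(torch.rand(1, 5, 10)), hidden_vect_1)
print('Output size:', output.size(), '- Hidden size:', [h.size() for h in hidden])
# Output size: torch.Size([1, 5, 128]) - Hidden size: [torch.Size([2, 1, 64]), torch.Size([2, 1, 64])]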