Get the last hidden state in stacked LSTM

Hello everyone,

I’m developing a classifier based on LSTM and I defined the model in this way:

class LSTMClassifier(nn.Module):
    """Sequence classifier: a (possibly stacked/bidirectional) LSTM followed
    by a linear head applied to the final hidden state of the LAST layer.

    Args:
        input_size: number of features per timestep.
        seq_len: expected sequence length (stored for reference only).
        n_classes: number of output classes.
        hidden_size: LSTM hidden dimension.
        device: device on which the initial hidden/cell states are created.
        dim_feedforward: unused, kept for interface compatibility.
        num_layers: number of stacked LSTM layers.
        dropout: inter-layer dropout (only active when num_layers > 1).
        bidirectional: if True, run the LSTM in both directions.
        batch_first: if True, input is (batch, seq, feature).
    """

    def __init__(self, input_size, seq_len, n_classes, hidden_size, device, dim_feedforward=1024, num_layers=1, dropout=0, bidirectional=False, batch_first=True):
        super(LSTMClassifier, self).__init__()

        self.hidden_size = hidden_size
        self.seq_len = seq_len
        self.input_size = input_size
        self.num_layers = num_layers
        # A bidirectional LSTM doubles both the number of state rows and the
        # feature width seen by the classifier.
        self.num_directions = 2 if bidirectional else 1
        self.device = device

        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout, bidirectional=bidirectional, batch_first=batch_first)
        self.classifier = nn.Linear(hidden_size * self.num_directions, n_classes)

    def init_hidden_state(self, x):
        """Return zeroed (h0, c0), each of shape
        (num_layers * num_directions, batch, hidden_size)."""
        rows = self.num_layers * self.num_directions
        batch = x.size(0)
        return (
            torch.zeros(rows, batch, self.hidden_size, device=self.device),
            torch.zeros(rows, batch, self.hidden_size, device=self.device),
        )

    def forward(self, x):
        """Classify each sequence in the batch; returns (batch, n_classes)."""
        self.hidden = self.init_hidden_state(x)

        out, (hn, cn) = self.lstm(x, self.hidden)
        # hn has shape (num_layers * num_directions, batch, hidden_size) and is
        # ordered from the FIRST to the LAST layer, so the final layer's hidden
        # state is at the end.  The original `hn.view(-1, hidden_size)` mixed
        # every layer into the batch dimension, producing num_layers * batch
        # rows — wrong for any num_layers > 1.
        if self.num_directions == 2:
            # Last layer's forward and backward final states, concatenated.
            last = torch.cat((hn[-2], hn[-1]), dim=1)
        else:
            last = hn[-1]
        y = self.classifier(last)

        return y

It should be correct. If I create a model with hidden_size=64 and num_layers=1, the size of hn (before view) is [1,1,64] when I give in input 1 sample.
If I change num_layers=2, the size of hn (before view) is [2,1,64] because hn stacks the final hidden state of each layer along the first dimension (one row per layer, ordered from the first to the last layer).

My doubt is: what is the order of hidden states in hn? The first element in hn is the hidden state of the first or of the last LSTM layer?
Is it correct to consider the last hidden state of the last LSTM layer to classify the sequences?

Thanks for your help.