Network inference with multiple inputs

Hi,
I have an application for a network which must receives multiple inputs (N > 3). So, I have an input tensor of (batch size, no channels, N, w, h). In my network, I have to extract features from each image from those N and to feed them into an LSTM. It is correct to write the network as below (especially the forward method) ?

I suspect that the training process is not performed well. Do you think the forward method is ok?

from networks.ResNet import resnet18
import torch.nn as nn
import torch


class ReccurentNet(nn.Module):

    def __init__(self, seq_len, n_features, hidden_dim=64):
        super(ReccurentNet, self).__init__()

        self.seq_len, self.n_features = seq_len, n_features

        self.rnn1 = nn.LSTM(
            input_size=n_features,
            hidden_size=hidden_dim,
            num_layers=1,
            batch_first=True
        )

    def forward(self, x):
        x, (_, _) = self.rnn1(x)
        x, (hidden_n, _) = self.rnn2(x)

        return hidden_n


class ConvRecNet(nn.Module):
    def __init__(self, hidden_dim=128):
        super(ConvRecNet, self).__init__()
        self.backbone = resnet18() # backbone features extractor, output 512 features

        self.rnn = nn.LSTM(
            input_size=512,
            hidden_size=hidden_dim,
            num_layers=5,
            batch_first=True
        )

        self.activation = nn.LeakyReLU()
        self.fc = nn.Linear(128, 24)

    def forward(self, x):
        windows = []
        for i in range(0, x.shape[2]):
            windows.append(self.backbone(x[:, :, i, :, :]))

        x = torch.stack(windows)
        x, _ = self.rnn(x)
        x = x[-1] # Take the last output from rnn, is it  ok ?

        x = self.activation(x)
        x = self.fc(x)
        return x

Thank you!