I have an application for a network which must receives multiple inputs (N > 3). So, I have an input tensor of (batch size, no channels, N, w, h). In my network, I have to extract features from each image from those N and to feed them into an LSTM. It is correct to write the network as below (especially the forward method) ?
I suspect that the training process is not performed well. Do you think the forward method is ok?
from networks.ResNet import resnet18 import torch.nn as nn import torch class ReccurentNet(nn.Module): def __init__(self, seq_len, n_features, hidden_dim=64): super(ReccurentNet, self).__init__() self.seq_len, self.n_features = seq_len, n_features self.rnn1 = nn.LSTM( input_size=n_features, hidden_size=hidden_dim, num_layers=1, batch_first=True ) def forward(self, x): x, (_, _) = self.rnn1(x) x, (hidden_n, _) = self.rnn2(x) return hidden_n class ConvRecNet(nn.Module): def __init__(self, hidden_dim=128): super(ConvRecNet, self).__init__() self.backbone = resnet18() # backbone features extractor, output 512 features self.rnn = nn.LSTM( input_size=512, hidden_size=hidden_dim, num_layers=5, batch_first=True ) self.activation = nn.LeakyReLU() self.fc = nn.Linear(128, 24) def forward(self, x): windows =  for i in range(0, x.shape): windows.append(self.backbone(x[:, :, i, :, :])) x = torch.stack(windows) x, _ = self.rnn(x) x = x[-1] # Take the last output from rnn, is it ok ? x = self.activation(x) x = self.fc(x) return x