LSTM Autoencoder with variable-length sequences

Hi, I am trying to train an LSTM autoencoder on variable-length sequences. I am feeding the sequences to the network one at a time rather than in batches (so I can't use pack_padded_sequence).

I have manually padded the sequences with zeros up to the maximum sequence length and I am feeding the padded sequences to the LSTM layers. My first question is: does the LSTM layer recognise that each incoming sequence is padded, or do I need to feed each sequence without padding, one example at a time?
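In case it helps, this is roughly how I build the padded array X and the array of true lengths (trials_array); the random data below is just a stand-in for my real sequences:

import numpy as np

# Toy stand-in for my real data: a list of variable-length sequences,
# each of shape (seq_len_i, n_features)
n_features = 190
max_seq_len = 56
rng = np.random.default_rng(0)
sequences = [rng.standard_normal((rng.integers(10, max_seq_len + 1), n_features))
             for _ in range(100)]

# Zero-pad every sequence along the time axis up to max_seq_len
X = np.zeros((len(sequences), max_seq_len, n_features))
trials_array = np.array([len(s) for s in sequences])  # keep the true lengths
for i, s in enumerate(sequences):
    X[i, :len(s), :] = s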

Second question: would training in batches speed up learning? If so, how can I implement it, and what batch size would you suggest?
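To make the second question concrete, this is the kind of batching I had in mind (just a sketch with an arbitrary batch size of 32, reusing X_train and seq_lengths_train from the code below; I am guessing pack_padded_sequence would become usable again once I batch, but I have not tried it with my model):

import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

# X_train is the padded tensor and seq_lengths_train the true lengths (see code below)
train_ds = TensorDataset(X_train, torch.as_tensor(seq_lengths_train))
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

for batch, lengths in train_loader:
    # batch: (batch_size, max_seq_len, n_features), lengths: (batch_size,)
    packed = nn.utils.rnn.pack_padded_sequence(
        batch, lengths.cpu(), batch_first=True, enforce_sorted=False
    )
    # ... the LSTM layers would then consume `packed` instead of the padded batch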

Thank you. Here is my code (X is an array of shape (n_sequences, max_seq_len, n_features) containing the already padded sequences):

import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# First split: training set vs. held-out data (trials_array holds the original sequence lengths)
X_train, X_val, seq_lengths_train, seq_lengths_val = train_test_split(
    X, trials_array,
    test_size=0.30,
    random_state=RANDOM_SEED
)

# Second split: half of the held-out data for validation, half for test
X_val, X_test, seq_lengths_val, seq_lengths_test = train_test_split(
    X_val, seq_lengths_val,
    test_size=0.5,
    random_state=RANDOM_SEED
)

# Flatten to 2D so StandardScaler fits/transforms per feature (190 = n_features)
X_train = X_train.reshape(-1, 190)
X_test = X_test.reshape(-1, 190)
X_val = X_val.reshape(-1, 190)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Reshape back to (n_sequences, max_seq_len, n_features) = (-1, 56, 190)
X_train = X_train.reshape(-1, 56, 190)
X_test = X_test.reshape(-1, 56, 190)
X_val = X_val.reshape(-1, 56, 190)

X_train = torch.from_numpy(X_train).double()
X_val = torch.from_numpy(X_val).double()
X_test = torch.from_numpy(X_test).double()

n_seq, max_seq_len, n_features = X_train.shape # n_seq = 7000, max_seq_len = 56, n_features = 190

class Encoder(nn.Module):

    def __init__(self, seq_len, n_features, embedding_dim=64):
        super(Encoder, self).__init__()

        self.seq_len, self.n_features = seq_len, n_features
        self.embedding_dim, self.hidden_dim = embedding_dim, 2 * embedding_dim

        self.rnn1 = nn.LSTM(
          input_size=n_features,
          hidden_size=self.hidden_dim,
          num_layers=1,
          batch_first=True
        )

        self.rnn2 = nn.LSTM(
          input_size=self.hidden_dim,
          hidden_size=embedding_dim,
          num_layers=1,
          batch_first=True
        )

    def forward(self, x):
        # Treat the single sequence as a batch of size 1: (1, seq_len, n_features)
        x = x.reshape((1, self.seq_len, self.n_features))
        x, (_, _) = self.rnn1(x)
        x, (hidden_n, _) = self.rnn2(x)

        # The last hidden state of the second LSTM is the embedding
        return hidden_n.reshape((1, self.embedding_dim))
    
class Decoder(nn.Module):

    def __init__(self, seq_len, input_dim=64, n_features=1):
        super(Decoder, self).__init__()

        self.seq_len, self.input_dim = seq_len, input_dim
        self.hidden_dim, self.n_features = 2 * input_dim, n_features

        self.rnn1 = nn.LSTM(
          input_size=input_dim,
          hidden_size=input_dim,
          num_layers=1,
          batch_first=True
        )

        self.rnn2 = nn.LSTM(
          input_size=input_dim,
          hidden_size=self.hidden_dim,
          num_layers=1,
          batch_first=True
        )

        self.output_layer = nn.Linear(self.hidden_dim, n_features)

    def forward(self, x):
        # Repeat the embedding across all time steps: (1, embedding_dim) -> (1, seq_len, input_dim)
        x = x.repeat(self.seq_len, 1)
        x = x.reshape((1, self.seq_len, self.input_dim))

        x, (hidden_n, cell_n) = self.rnn1(x)
        x, (hidden_n, cell_n) = self.rnn2(x)
        x = x.reshape((self.seq_len, self.hidden_dim))
        x = self.output_layer(x)

        return x
    
class RecurrentAutoencoder(nn.Module):

    def __init__(self, seq_len, n_features, embedding_dim=64):
        super(RecurrentAutoencoder, self).__init__()

        self.encoder = Encoder(seq_len, n_features, embedding_dim).to(device)
        self.decoder = Decoder(seq_len, embedding_dim, n_features).to(device)

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)

        return x
    
model = RecurrentAutoencoder(max_seq_len, n_features, 128)
model = model.to(device)

def train_model(model, train_dataset, val_dataset, seq_lengths_train, seq_lengths_val, n_epochs):
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.L1Loss(reduction='sum').to(device)
    history = dict(train=[], val=[])

    model = model.double()  # inputs are double precision, so convert the model once up front

    for epoch in range(1, n_epochs + 1):
        model = model.train()

        train_losses = []

        for seq_true in train_dataset:
            optimizer.zero_grad()
            seq_true = seq_true.to(device)
            seq_pred = model(seq_true)
           
            loss = criterion(seq_pred, seq_true)
            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())

        val_losses = []
        model = model.eval()
        with torch.no_grad():
            for seq_true in val_dataset:
                seq_true = seq_true.to(device)
                seq_pred = model(seq_true)

                loss = criterion(seq_pred, seq_true)
                val_losses.append(loss.item())

        train_loss = np.mean(train_losses)
        val_loss = np.mean(val_losses)

        history['train'].append(train_loss)
        history['val'].append(val_loss)

        print(f'Epoch {epoch}: train loss {train_loss} val loss {val_loss}')

    return model.eval(), history

model, history = train_model(
    model, 
    X_train,
    X_val,
    seq_lengths_train, 
    seq_lengths_val,
    n_epochs=150
)