LSTM Autoencoder variable length sequences

Hi, I am trying to train an LSTM autoencoder on variable-length sequences. I am feeding the sequences to the network one at a time, not in batches (therefore I can't use pack_padded_sequence).

I have manually padded the sequences with zeros up to the maximum sequence length and I am feeding the padded sequences to the LSTM layers. My first question is: does the LSTM layer recognise that each incoming sequence is padded, or do I need to feed each sequence example by example without its padding?
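
For reference, the padding I did looks roughly like this (simplified, with placeholder data; my real code also keeps track of each sequence's original length):

import numpy as np

# placeholder variable-length sequences, each of shape (seq_len_i, n_features)
sequences = [np.random.randn(length, 190) for length in (23, 40, 56)]

max_seq_len = max(s.shape[0] for s in sequences)
X = np.zeros((len(sequences), max_seq_len, 190))
for i, s in enumerate(sequences):
    X[i, :s.shape[0], :] = s  # real values first, zeros after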

My second question is: would training in batches speed up learning? If so, how can I implement it, and what batch size do you suggest?
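
This is roughly what I had in mind for batching, using the X_train tensor and seq_lengths_train array built in my code below, but I am not sure it is correct (the batch size of 32 is just a guess):

import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pack_padded_sequence

train_ds = TensorDataset(X_train, torch.as_tensor(seq_lengths_train))
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)

for batch, lengths in train_loader:
    # pack_padded_sequence tells the LSTM each sequence's true length, so the
    # zero padding is never processed; the lengths tensor has to live on the CPU
    packed = pack_padded_sequence(batch, lengths.cpu(), batch_first=True,
                                  enforce_sorted=False)
    # `packed` can be fed to an nn.LSTM in place of the padded tensor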

Thank you. Here is my code (X is a NumPy array of shape (n_sequences, max_seq_len, n_features) containing the already padded sequences):

import numpy as np
import torch
from torch import nn, optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# X, trials_array (the per-sequence lengths), RANDOM_SEED and device are defined earlier in my script

X_train, X_val, seq_lengths_train, seq_lengths_val = train_test_split(
    X, trials_array,
    test_size=0.30,
    random_state=RANDOM_SEED
)

X_val, X_test, seq_lengths_val, seq_lengths_test = train_test_split(
    X_val, seq_lengths_val,
    test_size=0.5, 
    random_state=RANDOM_SEED
)

# flatten to (n_samples * max_seq_len, n_features) so StandardScaler normalises each feature
X_train = X_train.reshape(-1, 190)
X_test = X_test.reshape(-1, 190)
X_val = X_val.reshape(-1, 190)

scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# back to (n_sequences, max_seq_len, n_features)
X_train = X_train.reshape(-1, 56, 190)
X_test = X_test.reshape(-1, 56, 190)
X_val = X_val.reshape(-1, 56, 190)

X_train = torch.from_numpy(X_train).double()
X_val = torch.from_numpy(X_val).double()
X_test = torch.from_numpy(X_test).double()

n_seq, max_seq_len, n_features = X_train.shape # n_seq = 7000, max_seq_len = 56, n_features = 190

class Encoder(nn.Module):

    def __init__(self, seq_len, n_features, embedding_dim=64):
        super(Encoder, self).__init__()

        self.seq_len, self.n_features = seq_len, n_features
        self.embedding_dim, self.hidden_dim = embedding_dim, 2 * embedding_dim

        self.rnn1 = nn.LSTM(
          input_size=n_features,
          hidden_size=self.hidden_dim,
          num_layers=1,
          batch_first=True
        )

        self.rnn2 = nn.LSTM(
          input_size=self.hidden_dim,
          hidden_size=embedding_dim,
          num_layers=1,
          batch_first=True
        )

    def forward(self, x):
        x = x.reshape((1, self.seq_len, self.n_features))
        x, (_, _) = self.rnn1(x)
        x, (hidden_n, _) = self.rnn2(x)

        return hidden_n.reshape((1, self.embedding_dim))
    
class Decoder(nn.Module):

    def __init__(self, seq_len, input_dim=64, n_features=1):
        super(Decoder, self).__init__()

        self.seq_len, self.input_dim = seq_len, input_dim
        self.hidden_dim, self.n_features = 2 * input_dim, n_features

        self.rnn1 = nn.LSTM(
          input_size=input_dim,
          hidden_size=input_dim,
          num_layers=1,
          batch_first=True
        )

        self.rnn2 = nn.LSTM(
          input_size=input_dim,
          hidden_size=self.hidden_dim,
          num_layers=1,
          batch_first=True
        )

        self.output_layer = nn.Linear(self.hidden_dim, n_features)

    def forward(self, x):
        x = x.repeat(self.seq_len, 1)
        x = x.reshape((1, self.seq_len, self.input_dim))

        x, (hidden_n, cell_n) = self.rnn1(x)
        x, (hidden_n, cell_n) = self.rnn2(x)
        x = x.reshape((self.seq_len, self.hidden_dim))
        x = self.output_layer(x)

        return x
    
class RecurrentAutoencoder(nn.Module):

    def __init__(self, seq_len, n_features, embedding_dim=64):
        super(RecurrentAutoencoder, self).__init__()

        self.encoder = Encoder(seq_len, n_features, embedding_dim).to(device)
        self.decoder = Decoder(seq_len, embedding_dim, n_features).to(device)

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)

        return x
    
model = RecurrentAutoencoder(max_seq_len, n_features, 128)
model = model.to(device)

def train_model(model, train_dataset, val_dataset, seq_lengths_train, seq_lengths_val, n_epochs):
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.L1Loss(reduction='sum').to(device)
    history = dict(train=[], val=[])

    for epoch in range(1, n_epochs + 1):
        model = model.double()
        model = model.train()

        train_losses = []

        for seq_true in train_dataset:
            optimizer.zero_grad()
            seq_true = seq_true.to(device)
            seq_pred = model(seq_true)
           
            loss = criterion(seq_pred, seq_true)
            loss.backward()
            optimizer.step()

            train_losses.append(loss.item())

        val_losses = []
        model = model.eval()
        with torch.no_grad():
            for seq_true in val_dataset:
                seq_true = seq_true.to(device)
                seq_pred = model(seq_true)

                loss = criterion(seq_pred, seq_true)
                val_losses.append(loss.item())

        train_loss = np.mean(train_losses)
        val_loss = np.mean(val_losses)

        history['train'].append(train_loss)
        history['val'].append(val_loss)

        print(f'Epoch {epoch}: train loss {train_loss} val loss {val_loss}')

    return model.eval(), history

model, history = train_model(
    model, 
    X_train,
    X_val,
    seq_lengths_train, 
    seq_lengths_val,
    n_epochs=150
)

Hi, I have a similar situation, but I would like to avoid using padded sequences so as not to affect the network too much, and passing one sequence at a time is extremely inefficient. Did you manage to find an answer to this problem?
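
What I have been experimenting with is pack_sequence, which takes a list of variable-length tensors directly so the data itself never needs padding, but I am not sure it is the right approach (the sizes below are just placeholders):

import torch
from torch import nn
from torch.nn.utils.rnn import pack_sequence, pad_packed_sequence

# dummy variable-length sequences, each of shape (seq_len_i, n_features)
seqs = [torch.randn(length, 190) for length in (56, 40, 23)]

packed = pack_sequence(seqs, enforce_sorted=False)
lstm = nn.LSTM(input_size=190, hidden_size=64, batch_first=True)

out_packed, (h_n, c_n) = lstm(packed)
# pad_packed_sequence recovers a padded tensor plus the original lengths,
# e.g. to mask the padded time steps out of the reconstruction loss
out, lengths = pad_packed_sequence(out_packed, batch_first=True)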