Properly batch-training a DAE with an LSTM encoder and decoder

import torch
from torch.utils.data import DataLoader

# pad_string, noise_name, strings_to_index_tensor, strings_to_tensor,
# to_rnn_tensor, index_to_char, encoder, decoder, criterion, and SOS
# are defined elsewhere in my code.

def denoise_train(x: DataLoader):
    loss = 0

    # Clean targets: pad each name to a fixed length, then map chars to indexes
    x_padded = list(map(pad_string, x))
    x_idx_tensor = strings_to_index_tensor(x_padded)

    # Noisy inputs: corrupt each name, pad it, and convert to an RNN-ready tensor
    noisy_x = list(map(noise_name, x))
    noisy_x_padded = list(map(pad_string, noisy_x))
    noisy_x_idx_tensor = strings_to_index_tensor(noisy_x_padded)
    noisy_x_rnn_tensor = to_rnn_tensor(noisy_x_idx_tensor)

    batch_sz = len(x)
    encoder_hidden = encoder.init_hidden(batch_size=batch_sz)

    # Run the noisy sequence through the encoder one timestep at a time
    for t in range(noisy_x_rnn_tensor.shape[0]):
        _, encoder_hidden = encoder(noisy_x_rnn_tensor[t].unsqueeze(0), encoder_hidden)
    
    # Decoding starts from SOS tokens, seeded with the encoder's final hidden state
    decoder_input = strings_to_tensor([SOS] * batch_sz)
    decoder_hidden = encoder_hidden
    names = [''] * batch_sz

    # Decode one character per timestep, accumulating loss against the clean targets
    for t in range(x_idx_tensor.shape[0]):
        decoder_probs, decoder_hidden = decoder(decoder_input, decoder_hidden)
        target_indexes = x_idx_tensor[t]
        best_indexes = torch.squeeze(torch.argmax(decoder_probs, dim=2), dim=0)
        decoder_probs = torch.squeeze(decoder_probs, dim=0)
        best_chars = list(map(lambda idx: index_to_char(int(idx)), best_indexes))
        loss += criterion(decoder_probs, target_indexes.long())

        # Append each batch element's predicted char to its running output string
        for i, char in enumerate(best_chars):
            names[i] += char

        # Feed the decoder's own predictions back in as the next input
        decoder_input = strings_to_tensor(best_chars)

    # Backprop through the loss accumulated over all character positions
    loss.backward()
    return names, noisy_x, loss.item()
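
For reference, this is roughly how I call it; the optimizer choice and loop details here are a sketch, not my exact code:

optimizer = torch.optim.Adam(list(encoder.parameters()) + list(decoder.parameters()))

for epoch in range(n_epochs):
    for batch in batches:  # each batch is a list of name strings
        optimizer.zero_grad()
        names, noisy_x, batch_loss = denoise_train(batch)  # backward() runs inside
        optimizer.step()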

I have this code for a denoising autoencoder on first names. It takes a first name and pre-pads it with a PAD character; I set it up this way so I could do batch training, since the names aren't all the same length and the padding brings them to a consistent length. After training it for a couple of days, I ran a name through it and it just printed out PADs. That makes sense: the loss is accumulated over every character position, pad positions included, so the model is rewarded for just generating PADs. How do I set this up so it doesn't learn that? I still need the padding for batching, and right now any noised name I put through the trained model comes out as all PADs.
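
To illustrate why I think the pads dominate, here is a toy example. The PAD index of 0, the vocabulary size, and the CrossEntropyLoss-style criterion are assumptions for illustration; my real setup may differ. A model that confidently predicts PAD at every position already achieves a low average loss, because most target positions are pads:

import torch
import torch.nn as nn

PAD_IDX = 0      # assumed PAD index, for illustration only
vocab_size = 30  # assumed vocabulary size

# One padded name: 7 PAD positions, 3 real character positions
target = torch.tensor([0, 0, 0, 0, 0, 0, 0, 5, 12, 7])

# Logits that confidently predict PAD at every timestep
logits = torch.full((10, vocab_size), -5.0)
logits[:, PAD_IDX] = 5.0

plain = nn.CrossEntropyLoss()
masked = nn.CrossEntropyLoss(ignore_index=PAD_IDX)

print(plain(logits, target).item())   # ~3.0: the 7 correct pad positions drag the mean down
print(masked(logits, target).item())  # ~10.0: with pads excluded, the model gets no credit

Is something like ignore_index (or an explicit mask over the pad positions) the right way to keep the loss from rewarding pads, while still batching with padded names?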