LSTM Give Same output for any input

dachosen1 · May 19, 2020, 5:58pm

I’m training an LSTM model to recognize patterns from different voices and classify them into two categories: Male and Female. I applied a Mel-frequency cepstrum transformation and captured 1/10 of a second long signal which equals an array of 512. I have about 30 million sequences for training. But I’m using a small percentage to experiment before training on the entire data.

I’m using an LSTM to classify the sequence to male or female, each time I run the model I get the same prediction with the same probability. I can’t figure out what’s wrong.

class AudioLSTM(nn.Module):

    def __init__(
            self,
            batch_size = 256,
            input_size = 512,
            hidden_size: 126,
            dropout: 0.3,
            num_layer: 126,
            output_size: 2,
            batch: bool = True,
            bidirectional: bool = True
    ) -> None:
        super(AudioLSTM, self).__init__()

        self.input_size = input_size 
        self.hidden_size = hidden_size
        self.num_layer = num_layer
        self.dropout = dropout
        self.output_size = output_size
        self.batch_size = batch_size

        
        self.RNN_TYPE = nn.LSTM(
                input_size=input_size,
                hidden_size=hidden_size,
                num_layers=num_layers,
                dropout=dropout,
                batch_first=batch,
                bidirectional=bidirectional,
            )

        self.dropout = nn.Dropout(dropout)
        self.out = nn.Sigmoid()

    def forward(self, mfcc, hidden):
	

        seq_length = mfcc.shape[1]  # seq_length = 512    #mfcc = torch.Size([256, 512, 1])
        mfcc_reshape = mfcc.float().view(1, -1, seq_length)  # mfcc_reshape = torch.Size([1, 256, 512])
        lstm_out, hidden = self.RNN_TYPE(mfcc_reshape, hidden) # lstm_out,  = torch.Size([1, 256, 252])
        lstm_out = self.dropout(lstm_out)
        final_layer = lstm_out.view(self.batch_size, -1, self.output_size)[:, -1]  # torch.Size([256, 2]) 
        layer_prob = torch.sigmoid(final_layer)  # layer_prob = torch.Size([256, 2])

        return layer_prob, hidden

    def init_hidden(self, batch_size: int):

        weight = next(self.parameters()).data

        if torch.cuda.is_available():
            hidden = (
                weight.new(self.num_layer * self.output_size, batch_size, self.hidden_size).zero_().cuda(),
                weight.new(self.num_layer * self.output_size, batch_size, self.hidden_size).zero_().cuda(),
            )

        else:
            hidden = (
                weight.new(self.num_layer * self.output_size, batch_size, self.hidden_size).zero_(),
                weight.new(self.num_layer * self.output_size, batch_size, self.hidden_size).zero_(),
            )

        return hidden

The latest run was for using a small percentage: see below for results.

Epoch: 1/1... Step: 20... Training Loss: 0.692648... Validation Loss: 0.692760 Train Accuracy: 0.531250 Test Accuracy: 0.519531
Epoch: 1/1... Step: 30... Training Loss: 0.692848... Validation Loss: 0.693522 Train Accuracy: 0.523438 Test Accuracy: 0.476562
Epoch: 1/1... Step: 40... Training Loss: 0.691812... Validation Loss: 0.693789 Train Accuracy: 0.546875 Test Accuracy: 0.472656