I’m training an LSTM model to recognize patterns from different voices and classify them into two categories: Male and Female. I applied a Mel-frequency cepstrum transformation and captured 1/10 of a second long signal which equals an array of 512. I have about 30 million sequences for training. But I’m using a small percentage to experiment before training on the entire data.
I’m using an LSTM to classify the sequence to male or female, each time I run the model I get the same prediction with the same probability. I can’t figure out what’s wrong.
class AudioLSTM(nn.Module):
def __init__(
self,
batch_size = 256,
input_size = 512,
hidden_size: 126,
dropout: 0.3,
num_layer: 126,
output_size: 2,
batch: bool = True,
bidirectional: bool = True
) -> None:
super(AudioLSTM, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layer = num_layer
self.dropout = dropout
self.output_size = output_size
self.batch_size = batch_size
self.RNN_TYPE = nn.LSTM(
input_size=input_size,
hidden_size=hidden_size,
num_layers=num_layers,
dropout=dropout,
batch_first=batch,
bidirectional=bidirectional,
)
self.dropout = nn.Dropout(dropout)
self.out = nn.Sigmoid()
def forward(self, mfcc, hidden):
seq_length = mfcc.shape[1] # seq_length = 512 #mfcc = torch.Size([256, 512, 1])
mfcc_reshape = mfcc.float().view(1, -1, seq_length) # mfcc_reshape = torch.Size([1, 256, 512])
lstm_out, hidden = self.RNN_TYPE(mfcc_reshape, hidden) # lstm_out, = torch.Size([1, 256, 252])
lstm_out = self.dropout(lstm_out)
final_layer = lstm_out.view(self.batch_size, -1, self.output_size)[:, -1] # torch.Size([256, 2])
layer_prob = torch.sigmoid(final_layer) # layer_prob = torch.Size([256, 2])
return layer_prob, hidden
def init_hidden(self, batch_size: int):
weight = next(self.parameters()).data
if torch.cuda.is_available():
hidden = (
weight.new(self.num_layer * self.output_size, batch_size, self.hidden_size).zero_().cuda(),
weight.new(self.num_layer * self.output_size, batch_size, self.hidden_size).zero_().cuda(),
)
else:
hidden = (
weight.new(self.num_layer * self.output_size, batch_size, self.hidden_size).zero_(),
weight.new(self.num_layer * self.output_size, batch_size, self.hidden_size).zero_(),
)
return hidden
The latest run was for using a small percentage: see below for results.
Epoch: 1/1... Step: 20... Training Loss: 0.692648... Validation Loss: 0.692760 Train Accuracy: 0.531250 Test Accuracy: 0.519531
Epoch: 1/1... Step: 30... Training Loss: 0.692848... Validation Loss: 0.693522 Train Accuracy: 0.523438 Test Accuracy: 0.476562
Epoch: 1/1... Step: 40... Training Loss: 0.691812... Validation Loss: 0.693789 Train Accuracy: 0.546875 Test Accuracy: 0.472656