Apologies for cross-posting.
I am trying to create a basic speaker verification model using the simplest possible RNN, but the loss function is not reducing. I am using speech from the LibriSpeech dataset. There are 40 speakers and the clips are of varying length. I have therefore implemented the RNN with a batch size of 1, as I can’t stack the samples into a batch together.
What I am noticing is that the model is learning to output the class at index 2 as a very high probability. The probability of this class gets higher as the training goes on. There must be some basic error in how I am passing data to the model, but I just can’t figure it out. Any help appreciated.
import torch
import librosa
import librosa.display
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torchaudio.datasets import LIBRISPEECH
from torch.utils.data import DataLoader
from google.colab import drive
drive.mount('/content/drive')
speaker_training_data = LIBRISPEECH("/content/drive/MyDrive/cc/librispeech", "dev-clean", download=False) # Change download=True if you don't have this stored on your gDrive
speakers_list = [84, 174, 251, 422, 652, 777, 1272, 1462, 1673, 1919, 1988,
1993, 2035, 2078, 2086, 2277, 2412, 2428, 2803, 2902, 3000,
3081, 3170, 3536, 3576, 3752, 3853, 5338, 5536, 5694, 5895,
6241, 6295, 6313, 6319, 6345, 7850, 7976, 8297, 8842]
labels_map = {k: v for v, k in enumerate(speakers_list)}
num_examples = speaker_training_data.__len__()
num_classes = len(speakers_list)
num_epochs = 100
batch_size = 1
learning_rate = 0.0001
input_size = 128
sequence_length = 0
hidden_size = 768
num_layers = 1
n_mels = input_size
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
class SpeakerEncoder(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(SpeakerEncoder, self).__init__()
self.num_layers = num_layers
self.hidden_size = hidden_size
self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
# x: (batch_size, seq_length, input_size)
self.fc = nn.Linear(hidden_size, num_classes)
def forward(self, x):
h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(DEVICE)
out, _ = self.rnn(x, h0)
# out: batch_size, seq_length, hidden_size
# out (N, L, 64)
out = out[:, -1, :]
out = self.fc(out)
return out
model = SpeakerEncoder(input_size, hidden_size, num_layers, num_classes).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
m = nn.Sigmoid()
def extract_features(wav, sr):
wav = wav.squeeze()
wav = wav.numpy()
wav = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=1024, hop_length=256, win_length=1024, n_mels=n_mels)
wav = librosa.amplitude_to_db(np.abs(wav), ref=np.max(wav))
input = torch.tensor(wav).unsqueeze(dim=1)
input = input.permute(1, 2, 0) # input: batch size, seq_length, input_size
# librosa.display.specshow(spec)
return input
# Randomly indices from the list of training examples
indices_for_train = torch.randint(low=0, high=num_examples-1, size=(1,num_examples), generator=torch.Generator().manual_seed(2147483647))
running_loss = 0.0
for epoch in range(num_epochs):
for idx in range(len(indices_for_train[0,:])):
wav, sr, _, s_id, _, _ = speaker_training_data[indices_for_train[:,idx]]
input = extract_features(wav, sr).to(DEVICE)
target = torch.zeros(num_classes)
target[labels_map[s_id]] = 1 # Encode one-hot target vector
target = target.unsqueeze(dim=0).to(DEVICE) # Add batch dimension
# zero gradients
optimizer.zero_grad()
# forward + backward + optimize
output = m(model(input))
loss = criterion(output, target)
scheduler.step(loss)
loss.backward()
optimizer.step()
running_loss += loss
if idx % 10 == 0: # print every 10 mini-batches
print(f"[{epoch + 1}, {idx + 1:5d}] loss: {running_loss / 10:.3f}")
running_loss = 0.0
Example output at 1 and 581 steps:
[1, 1] loss: 0.367
tensor([[0.4999, 0.3682, 0.5230, 0.5431, 0.5309, 0.4769, 0.4652, 0.6448, 0.5828,
0.4462, 0.5334, 0.5678, 0.6319, 0.6079, 0.3966, 0.6595, 0.5048, 0.5363,
0.4451, 0.2818, 0.6830, 0.5108, 0.6246, 0.3642, 0.3661, 0.6992, 0.3814,
0.4018, 0.3842, 0.2558, 0.3839, 0.4047, 0.5563, 0.3381, 0.3728, 0.6235,
0.3917, 0.4634, 0.7056, 0.5572]], device='cuda:0',
grad_fn=<SigmoidBackward0>)
[1, 581] loss: 3.692
tensor([[0.1987, 0.1686, 0.8574, 0.7862, 0.3269, 0.2106, 0.2115, 0.6590, 0.2302,
0.1253, 0.2404, 0.4212, 0.2641, 0.3300, 0.2433, 0.3660, 0.2477, 0.5319,
0.4255, 0.1146, 0.3354, 0.2716, 0.2908, 0.1727, 0.1950, 0.3938, 0.2844,
0.2152, 0.5146, 0.3037, 0.1507, 0.1472, 0.4040, 0.1506, 0.1563, 0.3281,
0.2195, 0.1958, 0.3725, 0.2938]], device='cuda:0',
grad_fn=<SigmoidBackward0>)