RNN speaker ID loss not reducing

Apologies for cross-posting.

I am trying to build a basic speaker identification model using the simplest possible RNN, but the loss is not decreasing. I am training on the dev-clean subset of LibriSpeech, which contains 40 speakers; the clips are of varying length, so I have implemented the RNN with a batch size of 1, since I can't stack variable-length samples into a single batch.
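
(I'm aware that padding plus pack_padded_sequence would let me batch variable-length clips; a rough sketch of the collate function I have in mind is below, with names of my own invention, but for now I've kept batch size 1 to keep the moving parts to a minimum.)

import torch
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

def collate_batch(batch):
  # batch: list of (features, label) pairs, features shaped (seq_len, input_size)
  feats = [f for f, _ in batch]
  labels = torch.tensor([l for _, l in batch])
  lengths = torch.tensor([f.size(0) for f in feats])
  padded = pad_sequence(feats, batch_first=True)  # (batch, max_seq_len, input_size)
  packed = pack_padded_sequence(padded, lengths, batch_first=True, enforce_sorted=False)
  return packed, labels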

What I am noticing is that the model learns to assign a very high probability to the class at index 2, and the probability of this class keeps rising as training goes on (a quick argmax check is shown after the training loop below). There must be some basic error in how I am passing data to the model, but I just can't figure it out. Any help appreciated.

Full Colab notebook here

import torch
import librosa
import librosa.display
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from torchaudio.datasets import LIBRISPEECH
from torch.utils.data import DataLoader
from google.colab import drive
drive.mount('/content/drive')

speaker_training_data = LIBRISPEECH("/content/drive/MyDrive/cc/librispeech", "dev-clean", download=False) # Change download=True if you don't have this stored on your gDrive

speakers_list = [84, 174, 251, 422, 652, 777, 1272, 1462, 1673, 1919, 1988,
                 1993, 2035, 2078, 2086, 2277, 2412, 2428, 2803, 2902, 3000,
                 3081, 3170, 3536, 3576, 3752, 3853, 5338, 5536, 5694, 5895,
                 6241, 6295, 6313, 6319, 6345, 7850, 7976, 8297, 8842]

labels_map = {k: v for v, k in enumerate(speakers_list)}  # speaker ID -> class index (0..39)

num_examples = len(speaker_training_data)
num_classes = len(speakers_list)
num_epochs = 100
batch_size = 1
learning_rate = 0.0001
input_size = 128  # one input feature per mel band
hidden_size = 768
num_layers = 1
n_mels = input_size

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

class SpeakerEncoder(nn.Module):
  def __init__(self, input_size, hidden_size, num_layers, num_classes):
    super(SpeakerEncoder, self).__init__()
    self.num_layers = num_layers
    self.hidden_size = hidden_size
    self.rnn = nn.RNN(input_size, hidden_size, num_layers, batch_first=True)
    self.fc = nn.Linear(hidden_size, num_classes)

  def forward(self, x):
    # x: (batch_size, seq_length, input_size)
    h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(DEVICE)
    out, _ = self.rnn(x, h0)  # out: (batch_size, seq_length, hidden_size)
    out = out[:, -1, :]       # keep only the final time step
    out = self.fc(out)        # one output per speaker class
    return out

model = SpeakerEncoder(input_size, hidden_size, num_layers, num_classes).to(DEVICE)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min')
m = nn.Sigmoid()
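
As a sanity check on the wiring, a dummy input with a made-up length of 100 frames produces the expected (1, 40) output:

# Smoke test: one fake 100-frame example should give one output per speaker
x = torch.randn(batch_size, 100, input_size).to(DEVICE)
print(model(x).shape)  # torch.Size([1, 40])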

def extract_features(wav, sr):
  wav = wav.squeeze().numpy()
  # Power mel spectrogram, shape (n_mels, n_frames)
  spec = librosa.feature.melspectrogram(y=wav, sr=sr, n_fft=1024, hop_length=256, win_length=1024, n_mels=n_mels)
  spec = librosa.power_to_db(spec, ref=np.max)  # log-scale the power spectrogram
  features = torch.tensor(spec).unsqueeze(dim=1)
  features = features.permute(1, 2, 0)  # (batch_size, seq_length, input_size)
  # librosa.display.specshow(spec)
  return features
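
As a quick shape check (LibriSpeech is 16 kHz audio, so with hop_length=256 a 5-second clip should come out at roughly 313 frames):

wav, sr, *_ = speaker_training_data[0]
print(sr, extract_features(wav, sr).shape)  # 16000 torch.Size([1, n_frames, 128])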

# Randomly sample indices into the training set (high is exclusive)
indices_for_train = torch.randint(low=0, high=num_examples, size=(1, num_examples), generator=torch.Generator().manual_seed(2147483647))
running_loss = 0.0

for epoch in range(num_epochs):
  for idx in range(len(indices_for_train[0, :])):
    wav, sr, _, s_id, _, _ = speaker_training_data[int(indices_for_train[0, idx])]
    features = extract_features(wav, sr).to(DEVICE)
    target = torch.zeros(num_classes)
    target[labels_map[s_id]] = 1  # encode one-hot target vector
    target = target.unsqueeze(dim=0).to(DEVICE)  # add batch dimension

    # zero gradients
    optimizer.zero_grad()
    # forward + backward + optimize
    output = m(model(features))  # sigmoid applied to the model's output
    loss = criterion(output, target)
    scheduler.step(loss)  # plateau scheduler stepped on every example's loss
    loss.backward()
    optimizer.step()

    running_loss += loss.item()
    if idx % 10 == 0:  # print every 10 mini-batches
      print(f"[{epoch + 1}, {idx + 1:5d}] loss: {running_loss / 10:.3f}")
      print(output)  # per-class sigmoid output for the current example
      running_loss = 0.0
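
To double-check that the collapse isn't specific to a single clip, I run a quick argmax check over a few examples after training, roughly like this:

# Diagnostic: true class index vs. predicted class (the prediction is almost always 2)
model.eval()
with torch.no_grad():
  for i in range(5):
    wav, sr, _, s_id, _, _ = speaker_training_data[i]
    feats = extract_features(wav, sr).to(DEVICE)
    print(labels_map[s_id], model(feats).argmax(dim=1).item())
model.train()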

Example output at steps 1 and 581 (the loss printout is followed by the sigmoid output for the current example):

[1,     1] loss: 0.367
tensor([[0.4999, 0.3682, 0.5230, 0.5431, 0.5309, 0.4769, 0.4652, 0.6448, 0.5828,
         0.4462, 0.5334, 0.5678, 0.6319, 0.6079, 0.3966, 0.6595, 0.5048, 0.5363,
         0.4451, 0.2818, 0.6830, 0.5108, 0.6246, 0.3642, 0.3661, 0.6992, 0.3814,
         0.4018, 0.3842, 0.2558, 0.3839, 0.4047, 0.5563, 0.3381, 0.3728, 0.6235,
         0.3917, 0.4634, 0.7056, 0.5572]], device='cuda:0',
       grad_fn=<SigmoidBackward0>)


[1,   581] loss: 3.692
tensor([[0.1987, 0.1686, 0.8574, 0.7862, 0.3269, 0.2106, 0.2115, 0.6590, 0.2302,
         0.1253, 0.2404, 0.4212, 0.2641, 0.3300, 0.2433, 0.3660, 0.2477, 0.5319,
         0.4255, 0.1146, 0.3354, 0.2716, 0.2908, 0.1727, 0.1950, 0.3938, 0.2844,
         0.2152, 0.5146, 0.3037, 0.1507, 0.1472, 0.4040, 0.1506, 0.1563, 0.3281,
         0.2195, 0.1958, 0.3725, 0.2938]], device='cuda:0',
       grad_fn=<SigmoidBackward0>)