Problems with Regression Network using mini-batches

I have a dataset that consists of 18 audio files with a duration of 5 minutes, annotated with affective labels (arousal) per timestep (25hz or per 0.04s). 9 of these files are used for evaluating performance.

For each audio file I create a feature vector with MFCC (20 bins) and MFCC deltas (also 20) per labeled timestep with overlapping.


def mfcc_features(filename, n_mfcc=20, n_mels=128, frame_time=0.08, hop_time=0.04):
  filepath = os.path.join(gf.audio_path[0], filename+'.wav')
  waveform, sample_rate = librosa.load(filepath, sr=None)
  frame_length = int(sample_rate * frame_time)
  hop_length = int(sample_rate * hop_time)
  melkwargs = {"n_fft" : frame_length, "n_mels" : n_mels, "hop_length": hop_length, 
              "f_min" : 0, "f_max" : None, "window_fn" : torch.hamming_window}
  mfcc = torchaudio.transforms.MFCC(sample_rate=sample_rate, n_mfcc=n_mfcc,  dct_type=2, norm='ortho', log_mels=True, melkwargs=melkwargs)(torch.from_numpy(waveform))[:,:-1]
  mfcc_deltas = torchaudio.functional.compute_deltas(mfcc, win_length=3)
  feature_vector =[mfcc, mfcc_deltas])
  return torch.FloatTensor(feature_vector).T

def label_vector(filename, target_value='arousal'):
  target_values = ['arousal', 'valence']
  if target_value not in target_values:
    raise ValueError("Invalid target value. Expected one of: %s" % target_values)

  filepath = os.path.join(gf.gold_standard_path[target_values.index(target_value)], filename+'.csv')
  df = pd.read_csv(filepath)

  return torch.FloatTensor(df['gold_standard'].values).unsqueeze(0).T

class AudioDataset(
  def __init__(self, list_IDs):
    self.list_IDs = list_IDs

  def __len__(self):
    return len(self.list_IDs)
  def __getitem__(self, index):
    # Select sample
    ID = self.list_IDs[index]

    # Load data and get label
    X = mfcc_features(ID)
    y = label_vector(ID)

    return X, y

Therefore I will have an X and y for each file with the following shape:

$ print(mfcc_features('P16').shape, label_vector('P16').shape)
torch.Size([7500, 40]) torch.Size([7500, 1])

I am currently having trouble using this dataset in a Network with mini-batches in the DataLoader.

Network architecture

import torch.nn as nn
import torch.nn.functional as F

class Network(nn.Module):
    def __init__(self, input_size, hidden_size):
      super(Network, self).__init__()
      self.dense_h1 = nn.Linear(in_features=input_size, out_features=hidden_size)
      self.relu_h1 = nn.ReLU()
      self.dropout = nn.Dropout(p=0.5)
      self.dense_out = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
      out = self.relu_h1(self.dense_h1(x))
      out = self.dropout(out)
      y_pred = self.dense_out(out)
      return y_pred

The rest of the code with the training sequence is as follows:

Initialization and training

use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True

# Parameters
params = {'batch_size': 3,
          'shuffle': True,
          'num_workers': 6}

learningRate = 1e-4
max_epochs = 100

# Model
input_size, hidden_size = 40, 20
model = Network(input_size, hidden_size)

if torch.cuda.is_available():

criterion = ConcordanceCorrelationCoefficient()
optimizer = torch.optim.Adam(model.parameters(), lr=learningRate)

# Datasets
partition = { 
    "train": ['P39', 'P23', 'P41', 'P46', 'P37', 'P16', 'P21', 'P25', 'P56'],
    "validation": ['P45', 'P26', 'P64', 'P34', 'P42', 'P65', 'P30', 'P19', 'P28']

# Generators
training_set = AudioDataset(partition['train'])
training_generator =, **params)

validation_set = AudioDataset(partition['validation'])
validation_generator =, **params)

# Loop over epochs
for epoch in range(max_epochs):
    # Training
    for local_batch, local_labels in training_generator:
        # Transfer to GPU
        local_batch, local_labels =,

        # Forward pass: Compute predicted y by passing x to the model
        outputs = model(local_batch)

        # Compute and print loss
        loss = criterion(outputs, local_labels)

        # Zero gradients, perform a backward pass, and update the weights.

    # Validation
    with torch.set_grad_enabled(False):
        for local_batch, local_labels in validation_generator:
            # Transfer to GPU
            local_batch, local_labels = local_batch.squeeze().to(device), local_labels.squeeze().to(device)

            # Forward pass: Compute predicted y by passing x to the model
            outputs = model(local_batch)

            # Compute and print loss
            loss = criterion(outputs, local_labels)
            print('Validation loss %.3f' % loss.item())

    print('Epoch: {}, Loss: {}'.format(epoch, loss.item()))

This sequence runs but the loss doesn’t really go down. I think it’s because the extra batch dimension creates a problem with my loss function, which is CCC. I have to use this metric for my research in addition to MSE.
torch.Size([3, 7500, 40]) torch.Size([3, 7500, 1])

Loss function

class ConcordanceCorrelationCoefficient(nn.Module):
    def __init__(self):
        super(ConcordanceCorrelationCoefficient, self).__init__()
        self.mean = torch.mean
        self.var = torch.var
        self.sum = torch.sum
        self.sqrt = torch.sqrt
        self.std = torch.std

    def forward(self, prediction, ground_truth):
        mean_gt = self.mean(ground_truth, 0)
        mean_pred = self.mean(prediction, 0)
        var_gt = self.var(ground_truth, 0)
        var_pred = self.var(prediction, 0)
        v_pred = prediction - mean_pred
        v_gt = ground_truth - mean_gt
        cor = self.sum (v_pred * v_gt) / (self.sqrt(self.sum(v_pred ** 2)) * self.sqrt(self.sum(v_gt ** 2)))
        sd_gt = self.std(ground_truth)
        sd_pred = self.std(prediction)
        numerator = 2 * cor * sd_gt * sd_pred
        denominator= var_gt + var_pred + (mean_gt - mean_pred) ** 2
        ccc = numerator / denominator
        return 1-ccc


  • Does it make sense to use each file as a different mini-batch or should I go about this in a different way? I could try to concatenate all feature sets into one batch.
  • How do I make the validation sequence more robust? Currently I only print the loss.
  • Are there other machine learning techniques that I should adopt or try?
  • Am I making it too complex for my use case? If so, what could I be doing differently?

Thanks for taking your time to read my topic, if you have any questions about my code or project, feel free to ask them.