I have a dataset of 18 audio files, each 5 minutes long, annotated with an affective label (arousal) per timestep (25 Hz, i.e. one label every 0.04 s). 9 of these files are used for evaluating performance.
For each audio file I build a feature vector of MFCCs (20 coefficients) and MFCC deltas (another 20) per labeled timestep, using overlapping frames.
Dataset
import os
import librosa
import pandas as pd
import torch
import torchaudio

def mfcc_features(filename, n_mfcc=20, n_mels=128, frame_time=0.08, hop_time=0.04):
    filepath = os.path.join(gf.audio_path[0], filename + '.wav')  # gf holds the project paths
    waveform, sample_rate = librosa.load(filepath, sr=None)
    frame_length = int(sample_rate * frame_time)
    hop_length = int(sample_rate * hop_time)
    melkwargs = {"n_fft": frame_length, "n_mels": n_mels, "hop_length": hop_length,
                 "f_min": 0, "f_max": None, "window_fn": torch.hamming_window}
    # [:, :-1] drops the last frame so the frame count matches the number of labels
    mfcc = torchaudio.transforms.MFCC(sample_rate=sample_rate, n_mfcc=n_mfcc, dct_type=2, norm='ortho',
                                      log_mels=True, melkwargs=melkwargs)(torch.from_numpy(waveform))[:, :-1]
    mfcc_deltas = torchaudio.functional.compute_deltas(mfcc, win_length=3)
    feature_vector = torch.cat([mfcc, mfcc_deltas])
    return feature_vector.float().T
def label_vector(filename, target_value='arousal'):
    target_values = ['arousal', 'valence']
    if target_value not in target_values:
        raise ValueError("Invalid target value. Expected one of: %s" % target_values)
    filepath = os.path.join(gf.gold_standard_path[target_values.index(target_value)], filename + '.csv')
    df = pd.read_csv(filepath)
    return torch.FloatTensor(df['gold_standard'].values).unsqueeze(0).T
class AudioDataset(torch.utils.data.Dataset):
    def __init__(self, list_IDs):
        self.list_IDs = list_IDs

    def __len__(self):
        return len(self.list_IDs)

    def __getitem__(self, index):
        # Select sample
        ID = self.list_IDs[index]
        # Load data and get label
        X = mfcc_features(ID)
        y = label_vector(ID)
        return X, y
Each file therefore yields an X and a y with the following shapes:
>>> print(mfcc_features('P16').shape, label_vector('P16').shape)
torch.Size([7500, 40]) torch.Size([7500, 1])
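When these per-file tensors go through a DataLoader with batch_size=3, the default collate function stacks them along a new leading dimension, which is where the extra batch dimension discussed further down comes from. A minimal sanity check, assuming three of the training file IDs used later:

loader = torch.utils.data.DataLoader(AudioDataset(['P16', 'P21', 'P25']), batch_size=3)
X_batch, y_batch = next(iter(loader))
# The three [7500, 40] / [7500, 1] samples are stacked into one tensor each
print(X_batch.shape, y_batch.shape)  # torch.Size([3, 7500, 40]) torch.Size([3, 7500, 1])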
I am currently having trouble using this dataset with mini-batches from the DataLoader in my network.
Network architecture
import torch.nn as nn
import torch.nn.functional as F

class Network(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(Network, self).__init__()
        self.dense_h1 = nn.Linear(in_features=input_size, out_features=hidden_size)
        self.relu_h1 = nn.ReLU()
        self.dropout = nn.Dropout(p=0.5)
        self.dense_out = nn.Linear(in_features=hidden_size, out_features=1)

    def forward(self, x):
        out = self.relu_h1(self.dense_h1(x))
        out = self.dropout(out)
        y_pred = self.dense_out(out)
        return y_pred
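Since nn.Linear only operates on the last dimension, a batched input is processed independently per timestep. A quick shape check with random data (just a sketch for illustration):

# nn.Linear broadcasts over the leading batch and time dimensions,
# so a [batch, time, features] input yields a [batch, time, 1] output.
dummy = torch.randn(3, 7500, 40)
net = Network(input_size=40, hidden_size=20)
print(net(dummy).shape)  # torch.Size([3, 7500, 1])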
The rest of the code with the training sequence is as follows:
Initialization and training
use_cuda = torch.cuda.is_available()
device = torch.device("cuda:0" if use_cuda else "cpu")
torch.backends.cudnn.benchmark = True

# Parameters
params = {'batch_size': 3,
          'shuffle': True,
          'num_workers': 6}
learningRate = 1e-4
max_epochs = 100

# Model
input_size, hidden_size = 40, 20
model = Network(input_size, hidden_size)
if use_cuda:
    model.cuda()
criterion = ConcordanceCorrelationCoefficient()
optimizer = torch.optim.Adam(model.parameters(), lr=learningRate)

# Datasets
partition = {
    "train": ['P39', 'P23', 'P41', 'P46', 'P37', 'P16', 'P21', 'P25', 'P56'],
    "validation": ['P45', 'P26', 'P64', 'P34', 'P42', 'P65', 'P30', 'P19', 'P28']
}

# Generators
training_set = AudioDataset(partition['train'])
training_generator = torch.utils.data.DataLoader(training_set, **params)
validation_set = AudioDataset(partition['validation'])
validation_generator = torch.utils.data.DataLoader(validation_set, **params)

# Loop over epochs
for epoch in range(max_epochs):
    # Training
    for local_batch, local_labels in training_generator:
        # Transfer to GPU
        local_batch, local_labels = local_batch.to(device), local_labels.to(device)
        # Forward pass: compute predicted y by passing x to the model
        outputs = model(local_batch)
        # Compute the loss
        loss = criterion(outputs, local_labels)
        # Zero gradients, perform a backward pass, and update the weights
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Validation
    with torch.no_grad():
        for local_batch, local_labels in validation_generator:
            # Transfer to GPU
            local_batch, local_labels = local_batch.squeeze().to(device), local_labels.squeeze().to(device)
            # Forward pass: compute predicted y by passing x to the model
            outputs = model(local_batch)
            # Compute and print the loss
            loss = criterion(outputs, local_labels)
            print('Validation loss %.3f' % loss.item())

    print('Epoch: {}, Loss: {}'.format(epoch, loss.item()))
This sequence runs, but the loss doesn't really go down. I think that's because the extra batch dimension causes a problem with my loss function, the concordance correlation coefficient (CCC). I have to use this metric for my research in addition to MSE. The batches fed to the model have the following shapes:
torch.Size([3, 7500, 40]) torch.Size([3, 7500, 1])
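Note also that the validation loop squeezes its tensors before the forward pass, so it sees slightly different shapes than the training loop does. A small illustration of that difference (purely for reference, not a fix):

# Training feeds [3, 7500, 1] labels to the loss, while the validation loop
# calls .squeeze() first, which removes the trailing singleton dimension.
labels = torch.randn(3, 7500, 1)
print(labels.shape)            # torch.Size([3, 7500, 1])
print(labels.squeeze().shape)  # torch.Size([3, 7500])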
Loss function
class ConcordanceCorrelationCoefficient(nn.Module):
    def __init__(self):
        super(ConcordanceCorrelationCoefficient, self).__init__()
        self.mean = torch.mean
        self.var = torch.var
        self.sum = torch.sum
        self.sqrt = torch.sqrt
        self.std = torch.std

    def forward(self, prediction, ground_truth):
        mean_gt = self.mean(ground_truth, 0)
        mean_pred = self.mean(prediction, 0)
        var_gt = self.var(ground_truth, 0)
        var_pred = self.var(prediction, 0)
        v_pred = prediction - mean_pred
        v_gt = ground_truth - mean_gt
        cor = self.sum(v_pred * v_gt) / (self.sqrt(self.sum(v_pred ** 2)) * self.sqrt(self.sum(v_gt ** 2)))
        sd_gt = self.std(ground_truth)
        sd_pred = self.std(prediction)
        numerator = 2 * cor * sd_gt * sd_pred
        denominator = var_gt + var_pred + (mean_gt - mean_pred) ** 2
        ccc = numerator / denominator
        # Return 1 - CCC so that minimising the loss maximises the CCC
        return 1 - ccc
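To illustrate the suspicion about the batch dimension: with a batched target of shape [3, 7500, 1], the reductions over dim 0 in this loss run across the batch rather than across time within each sequence. A minimal sketch of that behaviour with random data, just for illustration:

batched = torch.randn(3, 7500, 1)
# Reducing over dim 0 averages the three sequences elementwise per timestep...
print(torch.mean(batched, 0).shape)  # torch.Size([7500, 1])
# ...whereas reducing over dim 1 gives one mean per sequence over time.
print(torch.mean(batched, 1).shape)  # torch.Size([3, 1])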
Questions
- Does it make sense to use each file as a separate mini-batch, or should I go about this differently? I could also try concatenating all the feature sets into one big batch (see the sketch after this list).
- How do I make the validation sequence more robust? Currently I only print the loss.
- Are there other machine learning techniques that I should adopt or try?
- Am I making it too complex for my use case? If so, what could I be doing differently?
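For reference, the concatenation idea from the first question would look roughly like this (a sketch only; it loads every training file into memory at once and discards the file boundaries):

# Concatenate all training files along the time axis into one big set
X_all = torch.cat([mfcc_features(ID) for ID in partition['train']], dim=0)
y_all = torch.cat([label_vector(ID) for ID in partition['train']], dim=0)
print(X_all.shape, y_all.shape)  # torch.Size([67500, 40]) torch.Size([67500, 1])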
Thanks for taking the time to read my topic. If you have any questions about my code or project, feel free to ask.