Hello!
I am currently working on a music emotion recognition task. My input data consists of 619 songs with corresponding arousal and valence values (each clip is 30 s long and the annotations are sampled at 2 Hz, giving 60 annotations per song). I preprocessed the dataset by converting the songs into mel spectrograms; each batch of mel spectrograms has shape (N, 1, 96, 60), i.e. (batch, channel, n_mels, frames). The code below runs successfully, and I plotted the smoothed loss value for each epoch, but it never decreases at all. Is there anything I am doing wrong?
I am pretty new to deep learning so I do not have much knowledge. Thanks in advance.
#
# @author: Yoon mo Yang
#
from __future__ import print_function, division, absolute_import, unicode_literals
import six
import os
import numpy as np
import pandas as pd
import h5py
import torch
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import matplotlib.pyplot as plt
import audio_processor as ap
class DataforTorch(torch.utils.data.Dataset):
    """Dataset of audio features with arousal/valence targets stored in HDF5.

    The file at ``path`` must contain three datasets named ``audio``,
    ``arousal`` and ``valence``. Item ``i`` is taken from position ``i``
    along the first axis of each of them; ``audio`` is indexed with four
    axes and the two target datasets with two.

    The HDF5 file is opened lazily, on the first item access, rather than in
    ``__init__``. An h5py handle opened in the parent process must not be
    shared with ``DataLoader`` worker processes, so every process opens its
    own read-only handle the first time it needs one.
    """

    _KEYS = ('audio', 'arousal', 'valence')

    def __init__(self, path):
        """Validate the file layout and remember ``path`` for lazy opening.

        Raises:
            KeyError: if one of the required datasets is missing.
        """
        super(DataforTorch, self).__init__()
        self.path = path
        self._file = None
        self.audio = None
        self.arousal = None
        self.valence = None
        # Open only briefly: fail early on a malformed file and cache the
        # length so that len() never needs an open handle.
        with h5py.File(path, 'r') as f:
            missing = [key for key in self._KEYS if key not in f]
            if missing:
                raise KeyError(
                    "missing dataset(s) {} in {}".format(missing, path))
            self._length = f['audio'].shape[0]

    def _ensure_open(self):
        """Open the HDF5 file in the current process if not done already."""
        if self.audio is None:
            self._file = h5py.File(self.path, 'r')
            self.audio = self._file['audio']
            self.arousal = self._file['arousal']
            self.valence = self._file['valence']

    def close(self):
        """Release the HDF5 handle; it is reopened on the next item access."""
        if self._file is not None:
            self._file.close()
        self._file = None
        self.audio = None
        self.arousal = None
        self.valence = None

    def __getstate__(self):
        # h5py objects cannot be pickled; drop them so the dataset can be
        # sent to worker processes, which reopen the file on demand.
        state = self.__dict__.copy()
        for name in ('_file',) + self._KEYS:
            state[name] = None
        return state

    def __getitem__(self, index):
        """Return a dict of float32 tensors for sample ``index``.

        Keys are ``'audio'``, ``'arousal'`` and ``'valence'``.
        """
        self._ensure_open()
        return {
            'audio': torch.FloatTensor(self.audio[index, :, :, :]),
            'arousal': torch.FloatTensor(self.arousal[index, :]),
            'valence': torch.FloatTensor(self.valence[index, :]),
        }

    def __len__(self):
        """Return the number of samples (first axis of ``audio``)."""
        return self._length
class MerSCNN(nn.Module):
    """Small CNN regressor mapping a spectrogram to a sequence of values.

    Architecture: two blocks of (1x1 convolution -> ReLU -> batch norm ->
    2x1 max-pool), followed by three fully connected layers, each with a
    tanh activation. Because the last activation is tanh, predictions lie in
    [-1, 1] (assumes the regression targets are scaled to that range --
    TODO confirm against the annotation files).

    Args:
        batch_size: stored on the instance; not used by the network itself.
        n_mels: height of the input spectrogram. It is halved (floor) by
            each of the two max-pool layers.
        n_frames: width of the input spectrogram (unchanged by the pools).
        hidden_sizes: sizes of the two hidden fully connected layers.
        n_outputs: number of values predicted per input.
        dropout_p: dropout probability applied to the last hidden layer.

    The defaults reproduce the original hard-coded layer sizes
    (``16*24*60 -> 5760 -> 1440 -> 60``), and layer names and ordering are
    unchanged, so previously saved ``state_dict`` files still load.
    """

    def __init__(self, batch_size=32, n_mels=96, n_frames=60,
                 hidden_sizes=(5760, 1440), n_outputs=60, dropout_p=0.025):
        super(MerSCNN, self).__init__()
        self.batch_size = batch_size
        self.dropout_p = dropout_p
        hidden1, hidden2 = hidden_sizes

        # convolutional layer 1 + maxpooling layer 1
        self.clayer1 = nn.Sequential(
            nn.Conv2d(1, 8, kernel_size=[1, 1]),
            nn.ReLU(),
            nn.BatchNorm2d(8),
            nn.MaxPool2d(kernel_size=[2, 1]))
        # convolutional layer 2 + maxpooling layer 2
        self.clayer2 = nn.Sequential(
            nn.Conv2d(8, 16, kernel_size=[1, 1]),
            nn.ReLU(),
            nn.BatchNorm2d(16),
            nn.MaxPool2d(kernel_size=[2, 1]))

        # Each pool halves the height (floor division); the width is kept.
        pooled_height = (n_mels // 2) // 2
        # fully connected layers
        self.fc1 = nn.Linear(16 * pooled_height * n_frames, hidden1)
        self.t1 = nn.Tanh()
        self.fc2 = nn.Linear(hidden1, hidden2)
        self.t2 = nn.Tanh()
        self.fc3 = nn.Linear(hidden2, n_outputs)
        self.t3 = nn.Tanh()
        self.initParams()

    def initParams(self):
        """Xavier-initialise every parameter with more than one dimension.

        One-dimensional parameters (biases, batch-norm scale/shift) keep
        their default initialisation.
        """
        for param in self.parameters():
            if param.dim() > 1:
                torch.nn.init.xavier_normal_(param)

    def forward(self, x):
        """Return predictions of shape ``(batch, n_outputs)`` for input ``x``.

        ``x`` is expected to be ``(batch, 1, n_mels, n_frames)``.
        """
        features = self.clayer2(self.clayer1(x))
        flat = features.view(-1, self.num_flat_features(features))
        hidden = self.t1(self.fc1(flat))
        hidden = self.t2(self.fc2(hidden))
        # Dropout regularises the hidden representation and is disabled in
        # eval mode. Applying it to the final output, unconditionally, would
        # randomly zero and rescale the predictions themselves, including at
        # inference time.
        hidden = F.dropout(hidden, p=self.dropout_p, training=self.training)
        return self.t3(self.fc3(hidden))

    def num_flat_features(self, x):
        """Return the number of elements per sample (all dims except batch)."""
        num_features = 1
        for dim in x.size()[1:]:
            num_features *= dim
        return num_features
if __name__ == "__main__":
    # Training script for the arousal model: trains MerSCNN with an RMSE
    # loss, checkpoints the weights and plots the smoothed loss per epoch.
    eps = 1e-6   # numerical-stability epsilon passed to Adam
    eta = 0.999  # decay of the exponential moving average of the loss
    # how many audio files are fetched at a time
    batch_size = 32
    # path to save the model
    savedFilename_a = "savedModel_MerSCNN.pt"

    dataset = DataforTorch("dfile.hdf5")
    dataloader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=True, num_workers=2)

    # Pick the device first so the script also runs on CPU-only machines.
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model_a = MerSCNN(batch_size).to(device)
    # Restore a previously saved model, if any. map_location lets a
    # checkpoint written on a GPU machine load on a CPU-only one.
    if os.path.exists(savedFilename_a):
        model_a.load_state_dict(
            torch.load(savedFilename_a, map_location=device))

    optimizer_a = torch.optim.Adam(model_a.parameters(), lr=1e-3, eps=eps)
    criterion = nn.MSELoss()
    model_a.train(mode=True)

    nepoc = 20
    a_loss_sm = 0.0  # raw (biased) moving average of the batch loss
    n_steps = 0      # optimisation steps taken so far
    a_loss_trd = []  # bias-corrected smoothed loss, one value per epoch
    for epoc in tqdm(range(nepoc)):
        # Each time we fetch a batch of samples from the dataloader
        for sample in dataloader:
            audio = sample['audio'].to(device)
            arousal = sample['arousal'].to(device)

            # Clear gradients accumulated by the previous step.
            optimizer_a.zero_grad()
            a_hat = model_a(audio)
            a_loss = torch.sqrt(criterion(a_hat, arousal))  # RMSE loss
            a_loss.backward()
            optimizer_a.step()

            n_steps += 1
            a_loss_sm = eta * a_loss_sm + (1 - eta) * a_loss.item()

        if n_steps:
            # The moving average starts at 0, so with eta close to 1 the raw
            # value is dominated by that initial 0 for the first ~1/(1-eta)
            # steps and *rises* towards the true loss even while the model
            # improves. Dividing by (1 - eta**n_steps) removes that bias.
            a_loss_trd.append(a_loss_sm / (1 - eta ** n_steps))

        # Checkpoint after every epoch so an interrupted run can resume.
        torch.save(model_a.state_dict(), savedFilename_a)
        # NOTE: a valence model (model_v / savedFilename_v) is not trained yet.

    plt.figure()
    plt.plot(a_loss_trd)
    plt.title("arousal loss trend")
    plt.savefig("loss_trend.png")
    plt.gcf().clear()
    print('finished whole training')