My loss does not decrease at all

Hello!
I am currently working on a music emotion recognition task. My input data consists of 619 songs with corresponding arousal and valence annotations (each clip is 30 s long and the annotation sampling frequency is 2 Hz, so 60 annotations per song). I preprocessed the dataset by converting the songs into mel spectrograms, stored as an array of shape (N, 1, 96, 60), i.e. (batch, channel, n_mels, frames). The code below runs without errors, and I plotted the smoothed loss value for each epoch, but it never decreases at all. Is there anything I am doing wrong?
I am pretty new to deep learning, so I do not have much background knowledge. Thanks in advance.
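In case it matters, my preprocessing is roughly equivalent to the sketch below (illustrative only: the file list, sample rate, and frame cropping are placeholders, not the exact settings in my audio_processor module):

# mel_preprocess_sketch.py -- illustrative sketch, not my exact pipeline
import h5py
import librosa
import numpy as np

song_paths = ["song_000.wav"]  # placeholder list standing in for the 619 songs
sr = 22050                     # assumed sample rate
n_mels, n_frames = 96, 60      # target per-song shape

specs = []
for path in song_paths:
	y, _ = librosa.load(path, sr=sr)                          # 30 s clip
	S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=n_mels)
	S = librosa.power_to_db(S, ref=np.max)                    # log-mel
	S = S[:, :n_frames]                                       # crop to 60 frames (placeholder)
	specs.append(S[np.newaxis, :, :])                         # add channel dim -> (1, 96, 60)

with h5py.File("dfile.hdf5", "w") as f:
	f.create_dataset("audio", data=np.stack(specs))           # (N, 1, 96, 60)
	# 'arousal' and 'valence' datasets of shape (N, 60) are written the same way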

#
# @author: Yoon mo Yang 
#
from __future__ import print_function, division, absolute_import, unicode_literals
import os

import h5py
import torch
import torch.utils.data  # Dataset and DataLoader live here
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import matplotlib.pyplot as plt

class DataforTorch(torch.utils.data.Dataset):
	def __init__(self, path):
		super(DataforTorch,self).__init__()
		# keep the HDF5 file handle open for the lifetime of the dataset
		f = h5py.File(path, 'r')
		self.audio = f.get('audio')      # (N, 1, 96, 60) mel spectrograms
		self.arousal = f.get('arousal')  # (N, 60) arousal annotations
		self.valence = f.get('valence')  # (N, 60) valence annotations

	def __getitem__(self,index):
		resultDict = dict()
		resultDict['audio'] = torch.FloatTensor(self.audio[index,:,:,:])
		resultDict['arousal'] = torch.FloatTensor(self.arousal[index,:])
		resultDict['valence'] = torch.FloatTensor(self.valence[index,:])
		return resultDict

	def __len__(self):
		return self.audio.shape[0]

class MerSCNN(nn.Module):
	def __init__(self, batch_size=32):
		super(MerSCNN,self).__init__()
		self.batch_size = batch_size
		#########################################
		#        layers are defined below       #
		#########################################
		# convolutional layer 1 + maxpooling layer 1
		self.clayer1 = nn.Sequential(
			nn.Conv2d(1, 8, kernel_size=[1,1]),
			nn.ReLU(),
			nn.BatchNorm2d(8),
			nn.MaxPool2d(kernel_size=[2,1]))
		# convolutional layer 2 + maxpooling layer 2
		self.clayer2 = nn.Sequential(
			nn.Conv2d(8, 16, kernel_size=[1,1]),
			nn.ReLU(),
			nn.BatchNorm2d(16),
			nn.MaxPool2d(kernel_size=[2,1]))
		# fully connected layers: 96 mels pooled twice -> 24, frames stay at 60
		self.fc1 = nn.Linear(16*24*60, 5760)
		self.t1 = nn.Tanh()
		self.fc2 = nn.Linear(5760,1440)
		self.t2 = nn.Tanh()
		self.fc3 = nn.Linear(1440,60)
		self.t3 = nn.Tanh()
		self.initParams()

	def initParams(self):
		for param in self.parameters():
			if len(param.shape)>1:
				torch.nn.init.xavier_normal_(param)

	def forward(self,x):

		c = self.clayer1(x)
		a_c = self.clayer2(c)
		a_c = a_c.view(-1, self.num_flat_features(a_c))
		a_fc1 = self.t1(self.fc1(a_c))
		a_fc2 = self.t2(self.fc2(a_fc1))
		a_fc3 = self.t3(self.fc3(a_fc2))
		# pass the training flag: F.dropout defaults to training=True, so it would stay active at eval time
		a_hat = F.dropout(a_fc3, p=0.025, training=self.training)
		return a_hat

	def num_flat_features(self,x):
		size = x.size()[1:]
		num_features = 1
		for s in size:
			num_features *= s
		return num_features

if __name__ == "__main__":
	# numerical constants: eps stabilizes the RMSE sqrt (and Adam), eta is the loss-EMA decay
	eps = 1e-6
	eta = 0.999

	# how many audio files are fetched per batch
	batch_size = 32

	#path to save the model
	savedFilename_a = "savedModel_MerSCNN.pt"

	dataset = DataforTorch("dfile.hdf5")
	dataloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=2) # not sure how to deal with collate_fn yet

	# determine if cuda is available, then build the model on that device
	device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
	model_a = MerSCNN(batch_size).to(device)

	# if you want to restore your previously saved model
	if os.path.exists(savedFilename_a):
		model_a.load_state_dict(torch.load(savedFilename_a, map_location=device))


	optimizer_a = torch.optim.Adam(model_a.parameters(), lr=1e-3, eps=eps)
	model_a.train(mode=True)

	nepoch = 20
	a_loss_sm = 0   # exponentially smoothed loss (EMA with decay eta)
	criterion = nn.MSELoss()
	a_loss_trd = [] # per-epoch loss trend for plotting

	for epoch in tqdm(range(nepoch)):
		# each iteration fetches a batch of samples from the dataloader
		for idx, sample in enumerate(dataloader):
			# clear the accumulated gradients before each optimizer.step()
			optimizer_a.zero_grad()
			audio = sample['audio'].to(device)
			arousal = sample['arousal'].to(device)
			a_hat = model_a(audio)


			# RMSE loss; eps keeps the sqrt differentiable when the MSE is zero
			a_loss = torch.sqrt(criterion(a_hat, arousal) + eps)
			a_loss.backward()  # backprop
			optimizer_a.step() # does the update
			# exponential moving average of the loss for plotting
			a_loss_sm = eta*a_loss_sm + (1-eta)*a_loss.item()

		a_loss_trd.append(a_loss_sm)
		#save the model to savedFilename
		torch.save(model_a.state_dict(),savedFilename_a)
		# torch.save(model_v.state_dict(),savedFilename_v)
	
	plt.figure()
	plt.plot(a_loss_trd) 
	plt.title("arousal loss trend")
	plt.savefig("loss_trend.png")
	plt.gcf().clear()
	print('finished whole training')
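
For what it's worth, a quick shape check on a single batch looks correct to me (a minimal sketch run after building the dataloader and model above; the expected sizes follow from my data shapes):

# shape sanity check on one batch (illustrative, run separately from training)
sample = next(iter(dataloader))
with torch.no_grad():
	out = model_a(sample['audio'].to(device))
print(out.shape, sample['arousal'].shape)  # expecting torch.Size([32, 60]) for both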