Model not learning the GTZAN dataset

Hello,

I’m creating this topic because I need help fixing my deep learning model, which is supposed to learn music genres from the GTZAN dataset. I have tried a lot of things, but accuracy on the validation set always stays around 10–11% and I am out of ideas. As a starting point I took code I had already written for a course exercise on the CIFAR-10 dataset and adapted it. I also used a model architecture that is supposed to work on GTZAN, and I tried building the loaders in several different ways (loading all of GTZAN at once, using the pre-made training/validation/testing subsets, cutting each clip into 3 s segments, etc.), but still nothing… Here is my code:


# Imports used by the code below (GTZAN comes from torchaudio.datasets)
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchaudio.datasets as dset
from torch.utils.data import Dataset, DataLoader, sampler

# device/dtype are referenced throughout; standard defaults assumed here
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
dtype = torch.float32

# Loading the dataset
gtzan = dset.GTZAN('Dataset', download=True)

gtzan_train = dset.GTZAN('Dataset', subset='training', download=True)

gtzan_val = dset.GTZAN('Dataset', subset='validation', download=True)

gtzan_test = dset.GTZAN('Dataset', subset='testing', download=True)

# Dictionary mapping each genre name to its label index
Genres = {'blues' : 0, 'classical' : 1, 'country' : 2, 'disco' : 3, 'hiphop' : 4,
          'jazz' : 5, 'metal' : 6, 'pop' : 7, 'reggae' : 8, 'rock' : 9}

# Normalisation statistics computed beforehand over the raw waveforms
mu, std = -0.0010253926739096642, 0.15775565803050995
# Minimum length (in samples) that every clip is truncated to (about 30 s at 22.05 kHz)
length_min = 660000
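
# For reference, a minimal sketch (hypothetical helper, assuming each dataset item
# is a (waveform, sample_rate, genre) tuple as returned by torchaudio's GTZAN) of
# how such statistics could be recomputed:
def compute_waveform_stats(dataset, length=length_min):
  # Accumulate running sums instead of concatenating all waveforms,
  # so memory stays modest over the full dataset.
  total, total_sq, n = 0.0, 0.0, 0
  for waveform, _, _ in dataset:
    x = waveform[0, :length]          # mono channel, truncated to `length` samples
    total += x.sum().item()
    total_sq += (x ** 2).sum().item()
    n += x.numel()
  mean = total / n
  std = (total_sq / n - mean ** 2) ** 0.5
  return mean, std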

# Necessary because the raw clips don't all have the same length
def resize(tensor, random=False):
  if random:
    # Keep length_min randomly chosen samples (drop the rest)
    mask = torch.ones(tensor.numel(), dtype=torch.bool)
    indices = np.arange(tensor.numel())
    np.random.shuffle(indices)
    indices = indices[:tensor.numel() - length_min]
    mask[indices] = False
    return tensor[mask]
  else:
    # Simply truncate to the first length_min samples
    return tensor[:length_min]
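
# Quick sanity check (illustrative only, not part of the training code): both
# branches should return exactly length_min samples.
_waveform = torch.randn(675000)   # hypothetical clip longer than length_min
assert resize(_waveform).numel() == length_min
assert resize(_waveform, random=True).numel() == length_min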



# Minimal Dataset wrapper around a list of (waveform, label) pairs
class MusicDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __getitem__(self, index):
        return self.data[index]

    def __len__(self):
        return len(self.data)



#----------------- Loading datasets (option 1: whole dataset, manual index split) ------------------
dataset = []
segment = length_min
batch_size = 10
print_every = 20

for music in gtzan:
  dataset += [(torch.stack([(resize(music[0][0])-mu)/(std**2)]), Genres[music[2]])]


Musics = MusicDataset(dataset)
loader_train = DataLoader(Musics, batch_size=batch_size, sampler=sampler.SubsetRandomSampler(range(440)))
loader_val = DataLoader(Musics, batch_size=batch_size, sampler=sampler.SubsetRandomSampler(range(440,640)))
loader_test = DataLoader(Musics, batch_size=batch_size, sampler=sampler.SubsetRandomSampler(range(660,1000)))

#----------------- Loading datasets (option 2: pre-made training/validation/testing subsets) ------------------
dataset_train = []
dataset_val = []
dataset_test = []
segment = length_min
batch_size = 10
print_every = 20

for music in gtzan_train:
  dataset_train += [(torch.stack([(resize(music[0][0])-mu)/(std**2)]), Genres[music[2]])]

for music in gtzan_val:
  dataset_val += [(torch.stack([(resize(music[0][0])-mu)/(std**2)]), Genres[music[2]])]

for music in gtzan_test:
  dataset_test += [(torch.stack([(resize(music[0][0])-mu)/(std**2)]), Genres[music[2]])]


#----------------- Loading datasets (option 3: pre-made subsets, clips cut into segments) ------------------
# Each ~30 s clip is cut into ten ~3 s segments so there is more data to train on
dataset_train = []
dataset_val = []
dataset_test = []
segment = length_min//10
batch_size = 64
print_every = 20

for music in gtzan_train:
  resized = (resize(music[0][0])-mu)/(std**2)
  audio_cut = [resized[i*segment:(i+1)*segment] for i in range(10)]
  for audio in audio_cut:
    dataset_train += [(torch.stack([audio]), Genres[music[2]])]

for music in gtzan_val:
  resized = (resize(music[0][0])-mu)/(std**2)
  audio_cut = [resized[i*segment:(i+1)*segment] for i in range(10)]
  for audio in audio_cut:
    dataset_val += [(torch.stack([audio]), Genres[music[2]])]

for music in gtzan_test:
  resized = (resize(music[0][0])-mu)/(std**2)
  audio_cut = [resized[i*segment:(i+1)*segment] for i in range(10)]
  for audio in audio_cut:
    dataset_test += [(torch.stack([audio]), Genres[music[2]])]



#------Creating Loaders----------
Musics_train = MusicDataset(dataset_train)
loader_train = DataLoader(Musics_train, batch_size=batch_size)

Musics_val = MusicDataset(dataset_val)
loader_val = DataLoader(Musics_val, batch_size=batch_size)

Musics_test = MusicDataset(dataset_test)
loader_test = DataLoader(Musics_test, batch_size=batch_size)
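
# Illustrative sanity check (not required for training): a batch should come out
# as (batch_size, 1, segment) waveforms with one integer label per segment.
xb, yb = next(iter(loader_train))
print(xb.shape, yb.shape, yb.dtype)   # expected: (64, 1, 66000), (64,), torch.int64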

def flatten(x):
    N = x.shape[0] 
    return x.view(N, -1) 

def check_accuracy(loader, model):
    print("Checking accuracy")
    num_correct = 0
    num_samples = 0
    model.eval()  # set model to evaluation mode
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device, dtype=dtype)
            y = y.to(device=device, dtype=torch.long)

            scores = model(x)
            prediction = scores.argmax(dim=1)
            num_samples += x.shape[0]
            num_correct += (prediction == y).sum().item()

        acc = float(num_correct) / num_samples
        print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))

def run_val(loader, model):
    model.eval()
    loss = 0.0
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device, dtype=dtype)
            y = y.to(device=device, dtype=torch.long)

            scores = model(x)
            # Accumulate the cross-entropy over all validation batches
            loss = loss + F.cross_entropy(scores, y)

    return loss

def train_module(model, optimizer, epochs=1):
    losses = {}
    losses_val = {}
    model = model.to(device=device) 
    for e in range(epochs):
        for t, (x, y) in enumerate(loader_train):
            model.train()  # put model to training mode
            x = x.to(device=device, dtype=dtype)  
            y = y.to(device=device, dtype=torch.long)

            scores = model(x)
            loss = F.cross_entropy(scores, y)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            if t % print_every == 0:
                print('Iteration %d, loss = %.4f' % (t, loss.item()))
                losses[e * len(loader_train) + t] = loss.item()
                check_accuracy(loader_val, model)
                print()
        
        loss_val = run_val(loader_val, model)
        losses_val[(e + 1) * len(loader_train)] = loss_val.item()

    return losses, losses_val

class Flatten(nn.Module):
    def forward(self, x):
        return flatten(x)

learning_rate = 0.001

model = nn.Sequential(
    Flatten(),
    nn.Linear(segment,512),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(512,256),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Dropout(0.2),
    nn.Linear(64,10),  
)

optimizer = optim.Adam(model.parameters(),lr=learning_rate)

_ = train_module(model, optimizer, epochs=3)

Output:

Iteration 0, loss = 2.2613
Checking accuracy
Got 23 / 197 correct (11.68)

Iteration 20, loss = 9.7579
Checking accuracy
Got 22 / 197 correct (11.17)

Iteration 40, loss = 3.9026
Checking accuracy
Got 11 / 197 correct (5.58)

Iteration 0, loss = 64.3667
Checking accuracy
Got 17 / 197 correct (8.63)

Iteration 20, loss = 21.6154
Checking accuracy
Got 26 / 197 correct (13.20)

Iteration 40, loss = 12.0306
Checking accuracy
Got 20 / 197 correct (10.15)

Iteration 0, loss = 41.9825
Checking accuracy
Got 18 / 197 correct (9.14)

Iteration 20, loss = 9.2098
Checking accuracy
Got 28 / 197 correct (14.21)

Iteration 40, loss = 8.7790
Checking accuracy
Got 29 / 197 correct (14.72)

I have also tried different learning rates and optimizers, but none of them worked…
Also, when I check the accuracy on my training set to see whether the model learns at all, I sometimes reach about 90% training accuracy after a few epochs (but never on the validation set), and sometimes I don’t…
So I’m really out of ideas now, and if anyone has a clue about what is going wrong, it would really help me.