Advice for improving model accuracy with music classification

I’m building a model to classify the ‘difficulty’ of guitar music from midi files. During evaluation, my model consistently reaches poor accuracy values (~37%) and during prediction my model will consistently predict wrong (often predicts different pieces at the same grade.)

Below is the relevant code with my training and preprocessing:

def seq_prep(sequences):
    network_input = []
    for sequence in sequences:
        translated_seq = []t
        translated_seq.append([notes_to_num[item] for item in sequence])
    return network_input

class DifficultyClassifer(nn.Module):
    def __init__(self, input_size, num_classes, hidden_size=384):
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers=7, batch_first=True, dropout=0.4, bidirectional=True)
        self.fc1 = nn.Linear(hidden_size*2, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, 128)
        self.relu2 = nn.ReLU()
        self.fc3 = nn.Linear(128, 64)
        self.relu3 = nn.ReLU()
        self.fc4 = nn.Linear(64, num_classes)
    def forward(self, x):
        out, _ = self.lstm(x)
        out = self.fc1(out)
        out = self.relu(out)
        out = self.fc2(out)
        out = self.relu2(out)
        out = self.fc3(out)
        out = self.relu3(out)
        out = self.fc4(out)
        return out

test = []
def split(sequences):
    sequences = seq_prep(sequences)
    # print(sequences)
    padded_sequences = pad_sequence([torch.tensor(seq) for seq in sequences], batch_first=True, padding_value=-1)

    padded_sequences = (padded_sequences) / (n_vocab)

    grades_tensor = torch.tensor(grades, dtype=torch.long) #converts to tensor
    train_sequences, test_sequences, train_grades, test_grades = train_test_split(padded_sequences, grades_tensor, test_size=0.01)

    train_loader = DataLoader(list(zip(train_sequences, train_grades)), shuffle=False, batch_size=batch_size)
    test_loader = DataLoader(list(zip(test_sequences, test_grades)), shuffle=False, batch_size=batch_size)
    return train_loader, test_loader

def train_network(train_loader):   
    model = DifficultyClassifer(input_size=max_length, num_classes=9)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=25, verbose=True)

    num_epochs = 200

    start_time = time.time()
    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}\n-------------------------------")
        for batch, (input_batch, output_batch) in enumerate(train_loader):
            output = model(input_batch)

            output_batch = output_batch.view(-1)

            loss = criterion(output, output_batch)
            if batch % 25 == 0:
                print(f"Batch: {batch} - Loss: {loss.item():>7f}")
    end_time = time.time()-start_time
    print(f"Time taken: {end_time}"), "full_difficulty_pred.pth")

def eval_network(val_loader):
    model = DifficultyClassifer(input_size=max_length, num_classes=9)
    model_dict = torch.load('full_difficulty_pred.pth')

    total_correct = 0
    total_samples = 0

    all_true_labels = []
    all_predicted_labels = []
    with torch.no_grad():
        for val_batch, (val_input_batch, val_output_batch) in enumerate(val_loader):
            val_output = model(val_input_batch)
            print(f"RAW VALID OUTPUT: {val_output}")
            val_output_batch = val_output_batch.view(-1)
            # print("EXPECTED GRADE: ", val_output_batch)
            _, predicted_class = torch.max(val_output, 1)
            # print(f"PREDICTED CLASS: {predicted_class.item()}")

            # print(f"VAL TRUE CLASS : {val_output_batch}")
            # print(f"VAL PREDICTED CLASS : {predicted_class}")
            total_correct += (predicted_class == val_output_batch).sum().item()
            total_samples += val_output_batch.size(0)

    accuracy = accuracy_score(all_true_labels, all_predicted_labels)
    precision = precision_score(all_true_labels, all_predicted_labels, average='weighted')
    recall = recall_score(all_true_labels, all_predicted_labels, average='weighted')
    f1 = f1_score(all_true_labels, all_predicted_labels, average='weighted')

    print(f'Accuracy: {accuracy * 100:.2f}%')
    print(f'Precision: {precision:.2f}')
    print(f'Recall: {recall:.2f}')
    print(f'F1-Score: {f1:.2f}')

I’ve double checked that the pre-processing of the training/validation and prediction data is the same, along with checking my validation method to ensure it’s not giving me a false accuracy. I’m not sure what the problem could be.

I made the dataset myself and it is not feasible for me to increase the amount of data any further. Each class has a balanced amount of data.
I’ve tried to incrementally increase the complexity of my model alongside training for longer but to no avail and now I’m not sure what I can do in attempt to improve my results.

I would appreciate in any help improving the accuracy of my model. Thank you!