What can I do to improve this model's learning curve?

It just takes two audio files and tries to differentiate between them.

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import torchaudio
import librosa
import os
import random
from tqdm import tqdm

def preProcess(path, sr=800, length=5000, offset=1):
    """Load an audio file and return a fixed-length 1-D waveform tensor.

    The file is decoded with ``librosa.load`` (which downmixes to mono by
    default) and resampled to ``sr`` Hz. A window of ``length`` samples
    starting at sample ``offset`` is then extracted. Clips that are too
    short to fill the window are zero-padded at the end instead of raising,
    so the result always has exactly ``length`` elements.

    Args:
        path: Path of the audio file to load.
        sr: Target sample rate in Hz. The default of 800 Hz (0.8 kHz, not
            8 kHz) keeps only content below 400 Hz; pass ``sr=8000`` for
            8 kHz audio.
        length: Number of samples in the returned tensor.
        offset: Index of the first sample to keep. The default of 1 skips
            the very first sample, as the original code did.

    Returns:
        A 1-D ``torch.Tensor`` with ``length`` elements.
    """
    samples, _ = librosa.load(path, sr=sr)  # second item is the sample rate
    waveform = torch.from_numpy(samples)

    # Slicing (unlike torch.narrow) tolerates a window that runs past the
    # end of the clip; it simply returns fewer samples.
    clip = waveform[offset:offset + length]

    missing = length - clip.shape[0]
    if missing > 0:
        # Right-pad with silence so the network always sees `length` inputs.
        clip = F.pad(clip, (0, missing))
    return clip

# network:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"device: {device}")

class Network(nn.Module):
    """Fully connected classifier mapping a 5000-sample waveform to 2 logits.

    The network is a stack of eight ``nn.Linear`` layers
    (5000 -> 3000 -> 1000 -> 500 -> 250 -> 200 -> 100 -> 50 -> 2). The
    hidden layers share one activation function; the last layer is left
    linear so it produces raw scores.

    Args:
        activation: Callable applied after each of the seven hidden layers.
            Defaults to ``torch.relu``. A sigmoid after every layer of a
            stack this deep shrinks the gradient at each step, so the early
            layers barely learn. Pass ``torch.sigmoid`` to reproduce the
            earlier all-sigmoid behaviour.
    """

    def __init__(self, activation=torch.relu):
        super().__init__()
        self.activation = activation
        # Layer attribute names are kept as L1..L8 so existing state dicts
        # still load.
        self.L1 = nn.Linear(5000, 3000)
        self.L2 = nn.Linear(3000, 1000)
        self.L3 = nn.Linear(1000, 500)
        self.L4 = nn.Linear(500, 250)
        self.L5 = nn.Linear(250, 200)
        self.L6 = nn.Linear(200, 100)
        self.L7 = nn.Linear(100, 50)
        self.L8 = nn.Linear(50, 2)

    def forward(self, x):
        """Return the two output scores for ``x``.

        Args:
            x: Tensor whose last dimension has 5000 elements.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 2 (no activation applied).
        """
        hidden_layers = (
            self.L1, self.L2, self.L3, self.L4, self.L5, self.L6, self.L7,
        )
        for layer in hidden_layers:
            x = self.activation(layer(x))
        # Output layer stays linear.
        return self.L8(x)

# Training configuration.
epochs = 100
loss_hist = []  # intended to collect per-step loss values

# Build the model and move its parameters to the selected device.
net = Network()
net = net.to(device)

# Mean squared error against the target vector, optimised with plain SGD.
criterion = nn.MSELoss()
optimizer = optim.SGD(params=net.parameters(), lr=0.01)

# Switch the module to training mode. This only changes the behaviour of
# layers such as dropout or batch norm.
net.train()

# Training loop: every iteration draws one random clip from ./data and takes
# a single optimisation step on it.
data_dir = "./data"  # forward slashes work on Windows and POSIX alike
positive_name = "allahu acbar.wav"  # this file is labelled [0, 1]; all others [1, 0]
file_names = os.listdir(data_dir)  # listed once; the directory is not expected to change

for epoch in tqdm(range(1, epochs + 1)):
    fileName = random.choice(file_names)
    filePath = os.path.join(data_dir, fileName)

    # The model lives on `device`, so its input and target must be there too.
    sample = preProcess(filePath).to(device)

    # Label from the bare file name, so path separators cannot affect it.
    if fileName == positive_name:
        target = torch.tensor([0.0, 1.0], device=device)
    else:
        target = torch.tensor([1.0, 0.0], device=device)

    optimizer.zero_grad()  # clear gradients left over from the previous step
    out = net(sample)
    loss = criterion(out, target)
    loss.backward()  # without backward() + step() the weights never change
    optimizer.step()

    loss_hist.append(loss.item())  # record the scalar loss for later plotting



Working with raw waveforms and fully connected layers doesn’t really work.
Using sigmoid activations everywhere is not a good idea either; ReLU is better.

The standard approach is to apply an STFT to the audio so that you get a time-frequency representation, and then feed that to an already-designed network for classification; that should work. Also, your sr=800 is not 8 kHz; it is 0.8 kHz, which is practically useless because you are losing almost all of the information (8 kHz is already poor quality). Even the loss function seems a bit strange.

I would look for a course in audio processing, because you are still far away from something suitable.

Oh, oops, it used to be 8000; I forgot to change the comment. So are you suggesting that I start with something simpler?

If you are getting into deep learning I would go for images. Preprocessing for audio is a bit “tricky”.

If you still want to learn audio, then just look for a tutorial somewhere (like audio classification), as it’s not something to explain in a forum.