# This script takes two audio files and trains a small network to differentiate between them.
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
import torchaudio
import librosa
import os
import random
from tqdm import tqdm
def preProcess(path, sr=800, length=5000):
    """Load an audio file and return a fixed-length 1-D tensor.

    The file is read with ``librosa.load`` (which mixes down to one channel
    by default) and resampled to ``sr`` Hz.  ``length`` samples are then
    taken starting at sample index 1, i.e. the very first sample is skipped,
    exactly as the original implementation did.

    Clips that are too short to supply ``length`` samples are zero-padded at
    the end instead of letting ``torch.narrow`` raise a ``RuntimeError``.

    Args:
        path: Path of the audio file to load.
        sr: Target sample rate in Hz.  NOTE(review): the default is 800 Hz,
            although the original comment said "8kHz" -- confirm which one
            is intended before changing it.
        length: Number of samples in the returned tensor.

    Returns:
        A 1-D tensor with exactly ``length`` elements.
    """
    start = 1  # first sample kept; index 0 is skipped (original behaviour)
    samples, _ = librosa.load(path, sr=sr)
    tensor = torch.from_numpy(samples)

    available = max(tensor.shape[0] - start, 0)
    if available >= length:
        return torch.narrow(tensor, 0, start, length)

    # Not enough audio: keep what exists and zero-pad up to ``length``.
    padded = torch.zeros(length, dtype=tensor.dtype)
    if available:
        padded[:available] = tensor[start:]
    return padded
# network:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"device: {device}")
class Network(nn.Module):
    """Fully connected classifier: 5000 inputs -> 2 outputs.

    Eight linear layers, exposed as attributes ``L1`` .. ``L8``.  A sigmoid
    follows each of the first seven; the last layer's output is returned
    without an activation.
    """

    def __init__(self):
        super().__init__()
        # Consecutive pairs of widths give each layer's (in, out) features.
        widths = (5000, 3000, 1000, 500, 250, 200, 100, 50, 2)
        for index, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
            # Registered in order L1..L8, the same order as explicit assignment.
            setattr(self, f"L{index}", nn.Linear(fan_in, fan_out))

    def forward(self, x):
        """Run ``x`` through the network and return the 2 raw output values."""
        hidden_layers = (
            self.L1, self.L2, self.L3, self.L4, self.L5, self.L6, self.L7,
        )
        for layer in hidden_layers:
            x = torch.sigmoid(layer(x))
        # Final layer: no activation.
        return self.L8(x)
# --- model, optimiser and loss -------------------------------------------
net = Network().to(device)
optimizer = optim.SGD(net.parameters(), lr=0.01)
criterion = nn.MSELoss()
epochs = 100
loss_hist = []

# --- data ----------------------------------------------------------------
data_dir = "./data"
# The single file labelled [0, 1]; every other file is labelled [1, 0].
positive_file = "allahu acbar.wav"
# List the directory once instead of on every iteration.
file_names = os.listdir(data_dir)

# --- training: one randomly chosen file per iteration ---------------------
# train() enables training-mode behaviour of layers such as dropout or batch
# norm; this network has none, so the call is kept only for clarity.
net.train()
for epoch in tqdm(range(1, epochs + 1)):
    file_name = random.choice(file_names)
    filePath = os.path.join(data_dir, file_name)

    # Input and target must live on the same device as the network,
    # otherwise the forward pass fails when CUDA is in use.
    sample = preProcess(filePath).to(device)
    # Compare the bare file name so the label does not depend on the
    # platform's path separator.
    if file_name == positive_file:
        target = torch.tensor([0.0, 1.0], device=device)
    else:
        target = torch.tensor([1.0, 0.0], device=device)

    optimizer.zero_grad()
    out = net(sample)
    loss = criterion(out, target)
    loss.backward()
    optimizer.step()
    loss_hist.append(loss.item())

# --- report --------------------------------------------------------------
plt.plot(loss_hist)
plt.show()
print(loss_hist[-1])