here is my code, I’m just trying to diffrentiate between two .wav files, for exercise. the final target for the project is a hotword detector:
import torch
import torchaudio
import matplotlib.pyplot as plt
import os
import torch.nn as nn
import soundfile as sf
from pydub import AudioSegment
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")
AUDIO_DIR = './data/audio/'
IDX = 1
REUQESTED_FILE = str(IDX) + '.wav'
FilePath = os.path.join(str(AUDIO_DIR), str(REUQESTED_FILE))
def waveformToMelSpecrogram(wavFilePath):
waveform, sample_rate = torchaudio.load(wavFilePath)
waveform, sample_rate = sf.read(wavFilePath)
waveform = AudioSegment.from_mp3(wavFilePath)
waveform = waveform.set_channels(1)
waveform = waveform.get_array_of_samples()
waveform = torch.tensor(waveform, dtype=torch.float)
waveform = torch.reshape(waveform, (1, waveform.shape[0]))
mel_spectrogram = torchaudio.transforms.MelSpectrogram()(waveform)
return mel_spectrogram
class NN(nn.Module):
def __init__(self):
super(NN, self).__init__()
self.flatten = nn.Flatten()
self.linear_relu_stack = nn.Sequential(
nn.Linear(128 * 1756, 1500),
nn.ReLU(),
nn.Linear(1500, 1000),
nn.ReLU(),
nn.Linear(1000, 500),
nn.ReLU(),
nn.Linear(500, 2)
)
def forward(self, x):
x = self.flatten(x)
logits = self.linear_relu_stack(x)
return logits
specgram = waveformToMelSpecrogram(FilePath)
print(f"Shape of spectrogram: {specgram.size()}")
'''plt.figure()
plt.imshow(specgram.log2()[0,:,:].numpy())
plt.show()'''
net = NN().to(device)
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)
expected = torch.tensor([1])
print(f"expected: {expected}")
loss = 9999999999
while loss != 0:
prediction = net(specgram)
print(f"prediction: {prediction}")
# Compute prediction error
loss = loss_fn(prediction, expected)
print(f"loss: {loss}")
# Backpropagation
optimizer.zero_grad()
loss.backward()
optimizer.step()
the output:
C:\Users\hagit\AppData\Local\Microsoft\WindowsApps\python3.9.exe "C:/Users/hagit/OneDrive/שולחן העבודה/חננאל/neural networks/pytorch tutorials/speech recognition.py"
C:\Users\hagit\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\pydub\utils.py:170: RuntimeWarning: Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work
warn("Couldn't find ffmpeg or avconv - defaulting to ffmpeg, but may not work", RuntimeWarning)
Using cpu device
C:\Users\hagit\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\torchaudio\functional\functional.py:432: UserWarning: At least one mel filterbank has all zero values. The value for `n_mels` (128) may be set too high. Or, the value for `n_freqs` (201) may be set too low.
warnings.warn(
Shape of spectrogram: torch.Size([1, 128, 1756])
expected: tensor([1])
prediction: tensor([[ 1.1202e+09, -2.1888e+09]], grad_fn=<AddmmBackward>)
loss: 3309037824.0
prediction: tensor([[-1.0482e+12, 1.1267e+12]], grad_fn=<AddmmBackward>)
loss: 0.0
Process finished with exit code 0