I have a trained audio classifier network that I want to do live inference with. For that I need a mel spectrogram "live stream" from the microphone. The only way I can think of is recording the audio, saving it to a .wav file, and then loading it back with torchaudio. Is there a cleaner way to do it?
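For reference, this is roughly the round trip I'd like to avoid (a minimal sketch; the file name and sample rate are placeholders):

import sounddevice as sd
import torch
import torchaudio

sample_rate = 16000  # placeholder
duration = 5
audio = sd.rec(int(sample_rate * duration), samplerate=sample_rate, channels=1, dtype='float32')
sd.wait()
# torchaudio.save expects a tensor of shape [channels, time]
torchaudio.save("temp.wav", torch.from_numpy(audio).T, sample_rate)
waveform, sr = torchaudio.load("temp.wav")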
Here's what ChatGPT came up with, plus some tweaks of mine:
import torch
from torch.nn import functional as F
import sounddevice as sd
import torchaudio
def record_specgram():
    # Parameters
    sample_rate = 1200  # sampling rate in Hz
    duration = 5  # duration to record in seconds

    print("Recording...")
    # Record audio from the microphone; sd.rec returns a float32 NumPy array
    # of shape [frames, channels]
    audio = sd.rec(int(sample_rate * duration), samplerate=sample_rate, channels=1, dtype='float32')
    sd.wait()  # wait until the recording is finished
    print("Recording finished")

    # Convert the NumPy array to a 1-D PyTorch tensor of shape [time],
    # which is what MelSpectrogram accepts for mono audio
    waveform = torch.from_numpy(audio).squeeze()

    # Define the transformation; pass the actual sample rate, otherwise the
    # mel filterbank is built for the default of 16000 Hz
    mel_spectrogram_transform = torchaudio.transforms.MelSpectrogram(sample_rate=sample_rate)

    # Generate the mel spectrogram, shape [n_mels, time]
    mel_spectrogram = mel_spectrogram_transform(waveform)

    # Keep every other frame, halving the time resolution
    half_specgram = mel_spectrogram[:, ::2]

    # Left-pad the time axis to a fixed width
    half_specgram = F.pad(half_specgram, (174, 0))

    # Replace zero values so the log below is defined
    half_specgram[half_specgram == 0] = 1e-5
    half_specgram = torch.log(half_specgram)

    print("Mel spectrogram shape:", mel_spectrogram.shape)
    print("half Mel spectrogram shape:", half_specgram.shape)
    return half_specgram
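To actually run inference on the result, I do something like the following (a sketch; `model` stands in for my trained classifier, and the added dims depend on what input shape the network expects):

spec = record_specgram()  # shape: [n_mels, time]
with torch.no_grad():
    # add batch and channel dims -> [1, 1, n_mels, time]; adjust for your model
    logits = model(spec.unsqueeze(0).unsqueeze(0))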