What would be the "correct" way to classify live audio?

I have a trained audio classifier network that I want to do live inference with. For that I need a mel spectrogram "live stream" from the microphone. The only way that I can think of is recording the audio, saving it to a .wav file and then loading it back with torchaudio. Is there a cleaner way to do it?

here’s what chatGPT came up with + some tweaks:

import torch
from torch import nn
from torch.nn import functional as F
import sounddevice as sd
import torchaudio
import numpy as np

def record_specgram(duration: int = 5, sample_rate: int = 1200,
                    pad_width: int = 174, eps: float = 1e-5) -> torch.Tensor:
    """Record mono audio from the default microphone and return its
    left-padded, time-decimated log-mel spectrogram.

    Parameters
    ----------
    duration : int
        Seconds of audio to record.
    sample_rate : int
        Recording sample rate in Hz.  NOTE(review): 1200 Hz is unusually
        low for audio classification — confirm it matches the rate the
        classifier was trained on (possibly a typo for 12000 or 16000).
    pad_width : int
        Number of zero frames prepended along the time axis so the output
        width matches what the trained network expects.
    eps : float
        Value substituted for exact-zero bins so log() doesn't yield -inf.

    Returns
    -------
    torch.Tensor
        Log-mel spectrogram of shape (n_mels, time).
    """
    # Record audio from the microphone; sd.wait() blocks until done.
    audio = sd.rec(int(sample_rate * duration), samplerate=sample_rate,
                   channels=1, dtype='float32')
    sd.wait()

    print("Recording finished")

    # (frames, 1) NumPy array -> 1-D torch waveform.
    waveform = torch.from_numpy(audio).squeeze()

    # BUG FIX: the original called MelSpectrogram() with its default
    # sample_rate of 16000 Hz while recording at `sample_rate`, so the
    # mel filterbank covered the wrong frequency range.  Pass the actual
    # recording rate so the filterbank matches the audio.
    mel_spectrogram_transform = torchaudio.transforms.MelSpectrogram(
        sample_rate=sample_rate)
    mel_spectrogram = mel_spectrogram_transform(waveform)

    # Drop every other time frame.  This halves the spectrogram's time
    # resolution — it does NOT halve the audio sample rate, as the
    # original comment claimed.
    half_specgram = mel_spectrogram[:, ::2]

    # Left-pad the time axis to the expected width.  F.pad keeps the data
    # in torch; the original needlessly round-tripped through numpy
    # (np.pad / np.log / torch.from_numpy).
    half_specgram = F.pad(half_specgram, (pad_width, 0))

    # Floor exact zeros (including the padding just added) before the log.
    half_specgram[half_specgram == 0] = eps
    half_specgram = half_specgram.log()

    print("Mel spectrogram shape:", mel_spectrogram.shape)
    print("half Mel spectrogram shape:", half_specgram.shape)

    return half_specgram