Hi,
I just read the following in torchaudio
docs for both LFCC docs and MFCC docs:
This output depends on the maximum value in the input spectrogram, and so may return different values for an audio clip split into snippets vs. a full clip.
My guess is that it is related to the normalized
argument of Spectrogram that both LFCC and MFCC internally use (although it should default to False
). I also noticed that create_dct has a norm
argument, but it does not depend on the input spectrogram.
That being said, I actually need to perform LFCC frame by frame during inference and even after deactivating all the possible dependencies on the input spectrogram such as those mentioned above, I still get precision errors somewhere between 1e-5
and 1e-7
approx. for the calculated LFCC coefficients and I wanted to know if I may be neglecting other sources that may cause this difference or it is expected. Please find the related code here:
import torch
import torch.nn as nn
from typing import Callable, List, Tuple
from torchaudio.transforms import LFCC
class Transformations(nn.Module):
    """Thin ``nn.Module`` wrapper around :class:`torchaudio.transforms.LFCC`.

    Every spectrogram option is pinned at construction time so nothing
    about the transform depends on the input signal (``normalized=False``,
    configurable ``center``).
    """

    def __init__(self, sample_rate: int = 16000, num_lfcc: int = 60,
                 fft_size: int = 512, hop_size: int = 256,
                 fbank_size: int = 128, fft_center: bool = False,
                 window_fn: Callable = torch.hamming_window):
        super().__init__()
        # Keep the user-facing parameters around for later introspection.
        self.sample_rate = sample_rate
        self.num_lfcc = num_lfcc
        self.fbank_size = fbank_size
        # Spectrogram configuration forwarded verbatim to LFCC.
        self.speckwargs = dict(
            n_fft=fft_size,
            power=2.0,
            onesided=True,
            window_fn=window_fn,
            center=fft_center,
            hop_length=hop_size,
            normalized=False,
        )
        # The actual feature extractor.
        self.lfcc = LFCC(
            sample_rate=self.sample_rate,
            n_lfcc=self.num_lfcc,
            n_filter=self.fbank_size,
            log_lf=True,
            speckwargs=self.speckwargs,
            norm="ortho",
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return the LFCC features of waveform tensor ``x``."""
        return self.lfcc(x)
def compare_tensors(trials: List[Tuple[str, torch.Tensor, torch.Tensor]],
                    rtol: float = 1e-5, atol: float = 1e-8) -> None:
    """Print exact and approximate comparisons for each ``(name, a, b)`` trial.

    For every trial the function reports ``torch.equal`` (bitwise equality)
    and ``torch.allclose`` (tolerance-based equality); when either check
    fails it also prints the element-wise boolean and difference patterns.

    Args:
        trials: list of ``(name, a, b)`` tuples of tensors to compare.
        rtol: relative tolerance passed to ``torch.allclose``.
            Default 1e-5 matches torch's own default — the original
            ``0e-5`` evaluated to 0.0 and degenerated ``allclose`` into
            an exact-equality check.
        atol: absolute tolerance passed to ``torch.allclose``.
            Default 1e-8 matches torch's own default (was ``0e-8`` == 0.0).
    """
    for idx, (n, a, b) in enumerate(trials):
        equal = torch.equal(a, b)
        all_close = torch.allclose(a, b, rtol=rtol, atol=atol)
        # Print results
        print(f"==================== Trial {idx}: '{n}' ====================")
        print(f"a dims: {a.size()}; b dims: {b.size()}")
        print(f"Are a and b equal?: {equal}")
        print(f"Are a and b all close?: {all_close}")
        if not equal or not all_close:
            print(f"Boolean pattern:\n{a == b}")
            print(f"Difference pattern:\n{a - b}")
if __name__ == "__main__":
    # Synthetic-signal parameters.
    frame_size = 512
    hop_size = 256
    num_frames = 3
    dtype = torch.float32
    # Tolerances for the comparison: exact in the relative sense,
    # 1e-5 absolute.
    rtol = 0.0
    atol = 1e-5
    # Random waveform just long enough for `num_frames` overlapping frames.
    total_len = frame_size + hop_size * (num_frames - 1)
    x = torch.randn((1, 1, total_len), dtype=dtype)
    # Slice each frame out of an independent copy of the signal.
    frames = [x.clone()[..., i * hop_size:i * hop_size + frame_size]
              for i in range(num_frames)]
    # Feature extractor.
    transformations = Transformations()
    # LFCC of the whole signal at once ...
    x_lfcc = transformations(x)
    # ... and of each frame on its own.
    frame_lfccs = [transformations(f) for f in frames]
    # Build the trial list: raw frames first, then per-frame LFCCs
    # against the corresponding column of the full-signal run.
    trials = []
    for i in range(num_frames):
        start = i * hop_size
        trials.append((f"frame_{i}",
                       x[..., start:start + frame_size],
                       frames[i]))
    for i in range(num_frames):
        trials.append((f"lfcc_frame_{i}",
                       x_lfcc[..., i],
                       frame_lfccs[i][..., 0]))
    # Print the comparison report.
    compare_tensors(trials, rtol=rtol, atol=atol)
Output:
==================== Trial 0: 'frame_0' ====================
a dims: torch.Size([1, 1, 512]); b dims: torch.Size([1, 1, 512])
Are a and b equal?: True
Are a and b all close?: True
==================== Trial 1: 'frame_1' ====================
a dims: torch.Size([1, 1, 512]); b dims: torch.Size([1, 1, 512])
Are a and b equal?: True
Are a and b all close?: True
==================== Trial 2: 'frame_2' ====================
a dims: torch.Size([1, 1, 512]); b dims: torch.Size([1, 1, 512])
Are a and b equal?: True
Are a and b all close?: True
==================== Trial 3: 'lfcc_frame_0' ====================
a dims: torch.Size([1, 1, 60]); b dims: torch.Size([1, 1, 60])
Are a and b equal?: False
Are a and b all close?: True
Boolean pattern:
tensor([[[False, False, False, False, False, False, False, True, False, False,
False, False, False, True, False, False, False, False, True, False,
False, False, False, False, False, False, False, False, True, False,
False, False, False, False, False, False, False, True, False, False,
False, False, True, False, False, False, False, False, False, False,
True, False, False, False, False, False, False, True, False, False]]])
Difference pattern:
tensor([[[ 7.6294e-06, 8.3447e-07, 4.4703e-08, 1.7881e-07, -3.5763e-07,
-3.5763e-07, 7.1526e-07, 0.0000e+00, -1.1921e-07, -2.3842e-07,
5.9605e-08, 2.3842e-07, -2.3842e-07, 0.0000e+00, -5.9605e-08,
-1.1921e-07, 5.9605e-07, -2.3842e-07, 0.0000e+00, -5.9605e-08,
-3.5763e-07, -1.7881e-07, -2.3842e-07, 2.9802e-07, -1.1921e-07,
1.1921e-07, 2.0862e-07, -1.4901e-07, 0.0000e+00, -1.1921e-07,
-2.3842e-07, 2.9802e-07, -1.9073e-06, 1.7881e-07, 1.7881e-07,
-1.7881e-07, -5.9605e-08, 0.0000e+00, -1.1921e-07, -3.5763e-07,
1.7881e-07, -1.1921e-07, 0.0000e+00, -2.3842e-07, -1.7881e-07,
1.1921e-07, -1.1921e-07, 5.9605e-07, -9.5367e-07, 5.3644e-07,
0.0000e+00, -4.7684e-07, 1.1921e-07, 1.1921e-07, -1.1921e-07,
2.3842e-07, -8.9407e-08, 0.0000e+00, 4.1723e-07, 5.9605e-08]]])
==================== Trial 4: 'lfcc_frame_1' ====================
a dims: torch.Size([1, 1, 60]); b dims: torch.Size([1, 1, 60])
Are a and b equal?: False
Are a and b all close?: True
Boolean pattern:
tensor([[[ True, False, False, False, False, False, False, False, False, False,
False, False, False, True, False, False, False, False, False, False,
True, False, False, False, False, False, False, False, False, False,
False, False, False, False, False, True, False, True, True, False,
False, False, False, False, False, False, False, False, False, False,
False, False, False, False, False, True, False, True, False, False]]])
Difference pattern:
tensor([[[ 0.0000e+00, -9.5367e-07, -5.9605e-07, 3.5763e-07, -5.9605e-08,
-1.1921e-07, -2.9802e-07, -4.7684e-07, -1.7881e-07, 1.1921e-07,
2.0862e-07, -5.9605e-08, -1.7881e-07, 0.0000e+00, -3.5763e-07,
-5.9605e-07, -5.9605e-07, -7.1526e-07, 7.4506e-08, -4.1723e-07,
0.0000e+00, 2.9802e-08, 2.0862e-07, -1.7881e-07, -5.9605e-08,
-2.6822e-07, 2.9802e-08, 2.3842e-07, 2.9802e-07, 3.5763e-07,
1.7881e-07, -5.9605e-07, 4.7684e-07, 5.9605e-08, -2.3842e-07,
0.0000e+00, -2.6822e-07, 0.0000e+00, 0.0000e+00, 1.1921e-07,
5.9605e-08, 2.9802e-08, 1.7881e-07, 1.4901e-07, -1.1921e-07,
1.4901e-07, -5.9605e-08, -1.7881e-07, -2.3842e-07, -4.7684e-07,
-2.9802e-07, 6.5565e-07, 2.9802e-07, 3.5763e-07, -5.9605e-08,
0.0000e+00, -1.4901e-07, 0.0000e+00, 2.3842e-07, 4.1723e-07]]])
==================== Trial 5: 'lfcc_frame_2' ====================
a dims: torch.Size([1, 1, 60]); b dims: torch.Size([1, 1, 60])
Are a and b equal?: False
Are a and b all close?: True
Boolean pattern:
tensor([[[False, False, False, False, True, False, False, False, False, False,
False, False, False, True, False, False, False, False, False, False,
False, False, False, False, False, False, True, False, False, False,
False, False, False, True, False, True, False, False, False, False,
False, False, False, False, False, False, False, False, False, False,
False, True, False, False, False, False, False, True, False, False]]])
Difference pattern:
tensor([[[ 7.6294e-06, -1.1921e-06, 1.2517e-06, -2.9802e-07, 0.0000e+00,
2.3842e-07, 8.3447e-07, 1.1921e-07, -2.8312e-07, -2.9802e-08,
-2.0862e-07, -1.1921e-07, -1.1921e-07, 0.0000e+00, 4.7684e-07,
9.5367e-07, 1.7881e-07, 1.7881e-07, -2.6822e-07, -8.9407e-08,
-2.3842e-07, 1.7881e-07, 5.9605e-08, 5.9605e-08, -2.3842e-07,
6.7055e-08, 0.0000e+00, -2.9802e-07, 2.3842e-07, -1.1921e-07,
1.4901e-08, 1.1921e-07, 2.0266e-06, 0.0000e+00, 2.3842e-07,
0.0000e+00, -1.1921e-07, -5.9605e-08, 1.7881e-07, 3.2783e-07,
2.9802e-07, -1.1921e-07, -4.4703e-08, 1.4901e-07, 5.9605e-08,
2.3842e-07, 4.1723e-07, -1.1921e-07, 5.3644e-07, 1.7881e-07,
-5.9605e-08, 0.0000e+00, 2.3842e-07, -1.7881e-07, -1.7881e-07,
1.7881e-07, 4.7684e-07, 0.0000e+00, -5.9605e-08, -4.7684e-07]]])
Thanks for your help