CNN speech recognition model: good train/test accuracy yet poor inference on outside data?

My speech recognition model below reports over 90% train and test accuracy after 5 epochs, so it infers well on the training/test datasets, yet its predictions are wrong on any data outside of those sets. Why is this happening despite such high reported accuracy?

import torch
import torchaudio
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
import os
from tqdm import tqdm
import numpy as np

if torch.backends.mps.is_available() and torch.backends.mps.is_built():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

print('device:', device)

dataPath = '../dataset/mini_speech_commands'

classes = ('down', 'go', 'left', 'no', 'right', 'stop', 'up', 'yes')
num_classes = len(classes)


class mfccDS(Dataset):
    def __init__(self, data_dir):
        self.ds = data_dir

    # The length of the dataset
    def __len__(self):
        totalfiles = sum([len(files) for r, d, files in os.walk(self.ds)])
        return totalfiles

    # Load item in folder
    def __getitem__(self, index):
        # normalize index for the 0—7 label subdirectories
        normIndex = ((7 - 0) / (8000 - 0)) * (index - 8000) + 7
        normIndex = round(normIndex)

        # get label
        label = os.listdir(self.ds)[normIndex]

        # get label subdirectory
        subdir = os.path.join(self.ds, label)
        # number of images in given subdirectory
        numfiles = len(os.listdir(subdir))

        # stop from over-indexing
        normIndex_forWAV = ((numfiles - 0) / (8000 - 0)) * (index - 8000) + numfiles
        file = os.listdir(subdir)[int(normIndex_forWAV)]
        file = os.path.join(subdir, file)

        # transform file
        waveform, sr = torchaudio.load(file)
        # pad waveform tensors to [1, 16000]
        waveform = torch.nn.functional.pad(waveform, (int((16000 - len(waveform[0]))), 0))
        # convert to MFCC
        mfccT = torchaudio.transforms.MFCC()
        mfcc = mfccT(waveform)
        file = mfcc[0]
       

        return file, label


dataset = mfccDS(dataPath)

# train/test/val split: 80% train, then the remaining samples are split into 1000 test / 600 val
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size

train_dataset, test_dataset = random_split(dataset,
                                           [train_size, test_size],
                                           generator=torch.Generator().manual_seed(42))

test_dataset, val_dataset = random_split(test_dataset, [1000, 600],
                                         generator=torch.Generator().manual_seed(42))


def label_to_index(word):
    return torch.tensor(classes.index(word))


def index_to_label(index):
    return classes[index]


def collate_fn(batch):
    tensors, targets = [], []

    for file, label in batch:
        tensors += [file]
        targets += [label_to_index(label)]

    # Group the list of tensors into a batched tensor
    tensors = torch.stack(tensors)
    targets = torch.stack(targets)

    return tensors, targets


batch_size = 32

trainloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn,
                         shuffle=True, num_workers=0)
testloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn,
                        shuffle=False, num_workers=0)
valloader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn,
                       shuffle=False, num_workers=0)


# shape of mfcc

# filesWAV, labelsWAV = next(iter(trainloader))
# wav = filesWAV[31]
# print(np.shape(wav))


# build model
class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 8, kernel_size=(5, 5))
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(8, 16, kernel_size=(5, 5))
        self.fc1 = nn.Linear(1904, 32)  # 1904 = 16 * 7 * 17, the flattened conv output for a [40, 81] MFCC input
        self.fc2 = nn.Linear(32, 32)
        self.fc3 = nn.Linear(32, num_classes)

    def forward(self, x):
        # print(x.shape)
        x = x.unsqueeze(1)
        # print(x.shape)
        x = self.pool(F.relu(self.conv1(x)))
        x = self.pool(F.relu(self.conv2(x)))
        x = torch.flatten(x, 1)
        # print(x.shape)
        x = F.relu(self.fc1(x))
        # print(x.shape)
        x = F.relu(self.fc2(x))
        # print(x.shape)
        x = self.fc3(x)
        return x


net = Net()

# path to save model params
PATH = './net.pth'


def TrainAcc():
    correct = 0
    total = 0
    with torch.no_grad():
        net.eval()
        for data in trainloader:
            images, labels = data
            # calculate outputs by running images through the network
            outputs = net(images)
            # the class with the highest energy is what we choose as prediction
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return 100 * correct // total


def TestAcc():
    correct = 0
    total = 0
    with torch.no_grad():
        net.eval()
        for data in testloader:
            images, labels = data
            # calculate outputs by running images through the network
            outputs = net(images)
            # the class with the highest energy is what we choose as prediction
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    return 100 * correct // total


# train

def train(epoch):
    import torch.optim as optim

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(net.parameters(), lr=0.001, weight_decay=0.0001)
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)

    loop = tqdm(range(epoch), colour='green')
    for epoch in loop:
        running_loss = 0.0
        for i, data in enumerate(trainloader):
            # get the inputs; data is a list of [inputs, labels]
            inputs, labels = data

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # statistics
            running_loss += loss.item()

        valid_loss = 0.0
        net.eval()
        for i, data in enumerate(valloader):
            inputs, labels = data
            # Forward Pass
            outputs = net(inputs)
            # Find the Loss
            loss = criterion(outputs, labels)
            # Calculate Loss
            valid_loss += loss.item()

        loop.set_postfix(TrainLoss=running_loss / len(trainloader),
                         ValLoss=valid_loss / len(valloader),
                         TrainAcc=f'{TrainAcc()}%',
                         TestAcc=f'{TestAcc()}%')

    scheduler.step()
    torch.save(net.state_dict(), PATH)


n_epoch = 5

train(n_epoch)



# inference on test set

def inference(num):
    net.load_state_dict(torch.load(PATH))
    net.eval()

    tensor, label = test_dataset[num]
    tensor = tensor.unsqueeze(0)

    pred = net(tensor)
    pred = pred.argmax(dim=-1)
    pred = pred[0].squeeze()
    pred = index_to_label(pred)
    return label, pred


from random import randrange

randInf = randrange(len(test_dataset))

actual, predicted = inference(randInf)
print('actual:', actual, '\npredicted:', predicted)


# inference on personal recordings
def inf2(file):
    net.load_state_dict(torch.load(PATH))
    net.eval()

    waveform, sr = torchaudio.load(file)
    waveform = torch.nn.functional.pad(waveform, (int((16000 - len(waveform[0]))), 0))
    mfccT = torchaudio.transforms.MFCC()
    mfcc = mfccT(waveform)
    file = mfcc[0]
    file = file.unsqueeze(0)

    pred = net(file)
    pred = pred.argmax(dim=-1)
    pred = pred[0].squeeze()
    pred = index_to_label(pred)
    return pred


pathInf = '../dataset/mini_speech_commands/left/1a9afd33_nohash_0.wav'
print('inf2 predicted:', inf2(pathInf))

When I run inference() on random MFCC tensors from the test set, the predictions are correct and match the reported accuracy. However, when I run inf2() on my own recordings, which are separate from the training/test sets, the model does not classify them correctly.
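
One thing I can print on my side (just a sketch, reusing test_dataset, torchaudio and the dataset path from the listing above) is what each inference path actually hands to the network:

# Sketch: compare the tensor built by the Dataset/inference() path
# with the tensor built by the standalone inf2()-style preprocessing.
tensor_from_dataset, _ = test_dataset[0]
tensor_from_dataset = tensor_from_dataset.unsqueeze(0)   # what inference() feeds the net

wav_path = '../dataset/mini_speech_commands/left/1a9afd33_nohash_0.wav'
waveform, sr = torchaudio.load(wav_path)
waveform = torch.nn.functional.pad(waveform, (16000 - waveform.shape[1], 0))
tensor_from_file = torchaudio.transforms.MFCC()(waveform)[0].unsqueeze(0)  # inf2() path

print('dataset path:', tensor_from_dataset.shape)
print('file path   :', tensor_from_file.shape, 'loaded at', sr, 'Hz')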

With audio (or any data, really), it is quite easy to get the form in which it is presented subtly wrong (for audio, e.g. sample frequency, volume (as in data range), whether there is silence before and after, …). Were you able to exclude all of these?
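
A quick way to check (just a sketch; 'my_recording.wav' is a placeholder for one of your own files) would be something like:

# Compare a dataset file against one of your own recordings,
# before any padding or MFCC transform is applied.
ref, ref_sr = torchaudio.load('../dataset/mini_speech_commands/left/1a9afd33_nohash_0.wav')
own, own_sr = torchaudio.load('my_recording.wav')   # placeholder path

print('sample rate :', ref_sr, 'vs', own_sr)
print('num samples :', ref.shape[1], 'vs', own.shape[1])
print('value range :', (ref.min().item(), ref.max().item()),
      'vs', (own.min().item(), own.max().item()))

# If the sample rates differ, resample before padding / MFCC:
if own_sr != ref_sr:
    own = torchaudio.functional.resample(own, orig_freq=own_sr, new_freq=ref_sr)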

Best regards

Thomas

Thanks for the response, Tom. I’ve updated my inf2 function to use WAV files from the training dataset, yet the model still infers incorrectly. Can you take a look at the updated inference function I’ve added at the bottom of this comment?
Note: the model infers correctly from test_dataset via my inference function; it is only my custom inf2 function that is erroneous.

# inference on personal recordings
def inf2(file):
    net.load_state_dict(torch.load(PATH))
    net.eval()

    waveform, sr = torchaudio.load(file)
    print(sr)
    waveform = torch.nn.functional.pad(waveform, (int((16000 - len(waveform[0]))), 0))
    mfccT = torchaudio.transforms.MFCC()
    mfcc = mfccT(waveform)
    file = mfcc[0]
    file = file.unsqueeze(0)
    file = file.unsqueeze(0)

    print(file.shape)

    pred = net(file)
    pred = pred.argmax(dim=-1)
    pred = pred[0].squeeze()
    pred = index_to_label(pred)
    return pred


print('inf2 predicted:', inf2('../dataset/mini_speech_commands/go/0a9f9af7_nohash_0.wav'))

But are all the shapes the same between inf2 and inference (which goes through the dataset)?
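
For example, something like this (just a sketch, assuming the objects defined above are still in scope) would print both:

# Shape fed by the dataset-based inference() path: one unsqueeze.
t, _ = test_dataset[0]
print('inference() input:', t.unsqueeze(0).shape)

# Shape fed by the current inf2(): two unsqueezes on the same kind of tensor.
wav, sr = torchaudio.load('../dataset/mini_speech_commands/go/0a9f9af7_nohash_0.wav')
wav = torch.nn.functional.pad(wav, (16000 - wav.shape[1], 0))
m = torchaudio.transforms.MFCC()(wav)[0]
print('inf2() input     :', m.unsqueeze(0).unsqueeze(0).shape)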