My speech recognition model below reports > 90% test & train accuracies after 5 epochs thus infers good on the training/test datasets yet its prediction is not correct on any data outside of these sets. Why is this happening with such high reported accuracy?
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from PIL import Image
import os
from tqdm import tqdm
import numpy as np
if torch.backends.mps.is_available() and torch.backends.mps.is_built():
device = torch.device('mps')
else:
device = torch.device('cpu')
print('device:', device)
dataPath = '../dataset/mini_speech_commands'
classes = ('down', 'go', 'left', 'no', 'right', 'stop', 'up', 'yes')
num_classes = len(classes)
class mfccDS(Dataset):
def __init__(self, data_dir):
self.ds = data_dir
# The length of the dataset
def __len__(self):
totalfiles = sum([len(files) for r, d, files in os.walk(self.ds)])
return totalfiles
# Load item in folder
def __getitem__(self, index):
# normalize index for the 0—7 label subdirectories
normIndex = ((7 - 0) / (8000 - 0)) * (index - 8000) + 7
normIndex = round(normIndex)
# get label
label = os.listdir(self.ds)[normIndex]
# get label subdirectory
subdir = os.path.join(self.ds, label)
# number of images in given subdirectory
numfiles = len(os.listdir(subdir))
# stop from over-indexing
normIndex_forWAV = ((numfiles - 0) / (8000 - 0)) * (index - 8000) + numfiles
file = os.listdir(subdir)[int(normIndex_forWAV)]
file = os.path.join(subdir, file)
# transform file
waveform, sr = torchaudio.load(file)
# pad waveform tensors to [1, 16000]
waveform = torch.nn.functional.pad(waveform, (int((16000 - len(waveform[0]))), 0))
# convert to MFCC
mfccT = torchaudio.transforms.MFCC()
mfcc = mfccT(waveform)
file = mfcc[0]
return file, label
dataset = mfccDS(dataPath)
# train test val 70—20-10
train_size = int(0.8 * len(dataset))
test_size = len(dataset) - train_size
train_dataset, test_dataset = random_split(dataset,
[train_size, test_size],
generator=torch.Generator().manual_seed(42))
test_dataset, val_dataset = random_split(test_dataset, [1000, 600],
generator=torch.Generator().manual_seed(42))
def label_to_index(word):
return torch.tensor(classes.index(word))
def index_to_label(index):
return classes[index]
def collate_fn(batch):
tensors, targets = [], []
for file, label in batch:
tensors += [file]
targets += [label_to_index(label)]
# Group the list of tensors into a batched tensor
tensors = torch.stack(tensors)
targets = torch.stack(targets)
return tensors, targets
batch_size = 32
trainloader = DataLoader(train_dataset, batch_size=batch_size, collate_fn=collate_fn,
shuffle=True, num_workers=0)
testloader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn,
shuffle=False, num_workers=0)
valloader = DataLoader(val_dataset, batch_size=batch_size, collate_fn=collate_fn,
shuffle=False, num_workers=0)
# shape of mfcc
# filesWAV, labelsWAV = next(iter(trainloader))
# wav = filesWAV[31]
# print(np.shape(wav))
# build model
class Net(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(1, 8, kernel_size=(5, 5))
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(8, 16, kernel_size=(5, 5))
self.fc1 = nn.Linear(1904, 32)
self.fc2 = nn.Linear(32, 32)
self.fc3 = nn.Linear(32, num_classes)
def forward(self, x):
# print(x.shape)
x = x.unsqueeze(1)
# print(x.shape)
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = torch.flatten(x, 1)
# print(x.shape)
x = F.relu(self.fc1(x))
# print(x.shape)
x = F.relu(self.fc2(x))
# print(x.shape)
x = self.fc3(x)
return x
net = Net()
# path to save model params
PATH = './net.pth'
def TrainAcc():
correct = 0
total = 0
with torch.no_grad():
net.eval()
for data in trainloader:
images, labels = data
# calculate outputs by running images through the network
outputs = net(images)
# the class with the highest energy is what we choose as prediction
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
return 100 * correct // total
def TestAcc():
correct = 0
total = 0
with torch.no_grad():
net.eval()
for data in testloader:
images, labels = data
# calculate outputs by running images through the network
outputs = net(images)
# the class with the highest energy is what we choose as prediction
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
return 100 * correct // total
# train
def train(epoch):
import torch.optim as optim
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(net.parameters(), lr=0.001, weight_decay=0.0001)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=20, gamma=0.1)
loop = tqdm(range(epoch), colour='green')
for epoch in loop:
running_loss = 0.0
for i, data in enumerate(trainloader):
# get the inputs; data is a list of [inputs, labels]
inputs, labels = data
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(inputs)
loss = criterion(outputs, labels)
loss.backward()
optimizer.step()
# statistics
running_loss += loss.item()
valid_loss = 0.0
net.eval()
for i, data in enumerate(valloader):
inputs, labels = data
# Forward Pass
outputs = net(inputs)
# Find the Loss
loss = criterion(outputs, labels)
# Calculate Loss
valid_loss += loss.item()
loop.set_postfix(TrainLoss=running_loss / len(trainloader),
ValLoss=valid_loss / len(valloader),
TrainAcc=f'{TrainAcc()}%',
TestAcc=f'{TestAcc()}%')
scheduler.step()
torch.save(net.state_dict(), PATH)
n_epoch = 5
train(n_epoch)
# inference on test set
def inference(num):
net.load_state_dict(torch.load(PATH))
net.eval()
tensor, label = test_dataset[num]
tensor = tensor.unsqueeze(0)
pred = net(tensor)
pred = pred.argmax(dim=-1)
pred = pred[0].squeeze()
pred = index_to_label(pred)
return label, pred
from random import randrange
randInf = randrange(len(test_dataset))
print('actual:', inference(randInf)[0], '\n' 'predicted:', inference(randInf)[1])
# inference on personal recordings
def inf2(file):
net.load_state_dict(torch.load(PATH))
net.eval()
waveform, sr = torchaudio.load(file)
waveform = torch.nn.functional.pad(waveform, (int((16000 - len(waveform[0]))), 0))
mfccT = torchaudio.transforms.MFCC()
mfcc = mfccT(waveform)
file = mfcc[0]
file = file.unsqueeze(0)
pred = net(file)
pred = pred.argmax(dim=-1)
pred = pred[0].squeeze()
pred = index_to_label(pred)
return pred
pathInf = '../dataset/mini_speech_commands/left/1a9afd33_nohash_0.wav'
print('inf2 predicted:', inf2(pathInf))
When I run inference() on some random MFCC tensors from the test set I get results that match the reported training accuracy (ie it infers correctly). However, when I infer using inf2() on personal data separate from the training/test sets my model does not classify the tensors correctly.