I want to build a wake-word detection model using an RNN. I built a `Dataset` for it.
from pathlib import Path
import pandas as pd
import torch
import torchaudio
class WakeWordDataSet(torch.utils.data.Dataset):
    """Dataset yielding (MFCC, label) pairs read from a header-less CSV.

    Each CSV row is expected to be ``path,label`` (column 0 = audio file
    path, column 1 = class label) -- TODO confirm against the CSV files.
    All audio is resampled to ``sample_rate`` before the MFCC transform,
    so every item is featurized at a consistent rate.
    """

    def __init__(self, csv_file: Path, sample_rate: int):
        self._data = pd.read_csv(csv_file, header=None)
        self._sample_rate = sample_rate
        # MFCC transform is configured once for the target sample rate.
        self._mfcc = torchaudio.transforms.MFCC(sample_rate)
        # Cache one Resample transform per source rate: constructing a new
        # transform per __getitem__ call rebuilds its kernel every time.
        self._resamplers: dict = {}

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        row = self._data.iloc[idx]
        waveform, sample_rate = torchaudio.load(row[0])
        label = row[1]
        # BUG FIX: the original resampled only when the source rate was
        # HIGHER than the target, so lower-rate files reached an MFCC
        # transform configured for the wrong rate. Resample whenever the
        # rates differ.
        if sample_rate != self._sample_rate:
            if sample_rate not in self._resamplers:
                self._resamplers[sample_rate] = torchaudio.transforms.Resample(
                    sample_rate, self._sample_rate
                )
            waveform = self._resamplers[sample_rate](waveform)
        return self._mfcc(waveform), label
It simply computes the MFCC.
Here's my model:
import torch
import torch.nn as nn
class WakeWordDetector(nn.Module):
    """Binary wake-word classifier: a stacked GRU followed by a linear head.

    NOTE(review): the first constructor argument is the per-timestep
    feature size (n_mfcc in this project) despite its ``seq_length`` name.
    """

    def __init__(self, seq_length, hidden_size, num_layers):
        super().__init__()
        self._gru = nn.GRU(seq_length, hidden_size, num_layers=num_layers, dropout=0.25)
        self._classifier = nn.Linear(num_layers * hidden_size, out_features=1)

    def forward(self, x: torch.Tensor):
        """Map a (batch, seq_len, features) tensor to per-sample logits of shape (batch,)."""
        # nn.GRU defaults to sequence-first input, so move seq_len to dim 0.
        seq_first = x.transpose(0, 1)
        # Keep only the final hidden state: (num_layers, batch, hidden).
        _, hidden = self._gru(seq_first)
        # (batch, num_layers, hidden) -> (batch, num_layers * hidden)
        features = hidden.transpose(0, 1).flatten(1)
        logits = self._classifier(features)
        # Drop the trailing singleton dim so each sample gets one logit.
        return logits.squeeze(-1)
And here's my training script:
from pathlib import Path
from sklearn.metrics import f1_score
import torch
import torch.optim as optim
from torch.utils.data import DataLoader
from wakeword.dataset import WakeWordDataSet
from wakeword.model import WakeWordDetector
from wakeword.utils import collate_fn
# --- data & model setup ---------------------------------------------------
train_data = WakeWordDataSet(Path('./wakeword/train.csv'), 8000)
test_data = WakeWordDataSet(Path('./wakeword/test.csv'), 8000)
model = WakeWordDetector(40, 64, num_layers=4)
train_loader = DataLoader(dataset=train_data, batch_size=32, collate_fn=collate_fn, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=32, collate_fn=collate_fn)

epochs = 100
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
opt = optim.AdamW(model.parameters(), lr=0.0001)
# FIX: BCEWithLogitsLoss fuses sigmoid + BCE and is numerically stable;
# the model already returns raw logits, so no separate sigmoid before the
# loss is needed (sigmoid is applied only for the rounded predictions).
criterion = torch.nn.BCEWithLogitsLoss()

for epoch in range(epochs):
    running_loss = 0.0
    running_f1 = 0.0
    model.train(True)
    for i, data in enumerate(train_loader):
        mfccs = data[0].to(device)
        # BCE-style losses require float targets.
        labels = data[1].float().to(device)
        opt.zero_grad()
        logits = model(mfccs)
        loss = criterion(logits, labels)
        loss.backward()
        opt.step()
        # Threshold probabilities at 0.5 for the F1 metric.
        rounded = torch.round(torch.sigmoid(logits))
        running_f1 += f1_score(labels.cpu().numpy(), rounded.detach().cpu().numpy())
        running_loss += loss.item()
        if i % 20 == 19:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 20))
            print('[%d, %5d] f1: %.3f' %
                  (epoch + 1, i + 1, running_f1 / 20))
            running_loss = 0.0
            running_f1 = 0.0

    # --- evaluation on the held-out set -----------------------------------
    test_f1 = 0.0
    test_loss = 0.0
    model.eval()
    with torch.no_grad():
        for test_batch in test_loader:
            mfccs = test_batch[0].to(device)
            labels = test_batch[1].float().to(device)
            logits = model(mfccs)
            rounded = torch.round(torch.sigmoid(logits))
            test_f1 += f1_score(labels.cpu().numpy(), rounded.cpu().numpy())
            test_loss += criterion(logits, labels).item()
    print(f'test loss: {test_loss / len(test_loader)}')
    print(f'test f1: {test_f1 / len(test_loader)}')
    # FIX: the original tested a float division for exact equality with
    # 1.0. Each perfect batch contributes exactly 1.0 to test_f1, so
    # comparing the sum to the batch count is the exact, robust check.
    if test_f1 == len(test_loader):
        torch.save(model.state_dict(), f'model{epoch}.pt')
        break
I have an imbalanced dataset, so I'm using the F1 score.
This model converges pretty fast, and I can see it myself using the debugger — the outputs are really good, and it can achieve an F1 score of 1.0. But when I want to reload this model in some script, for example:
import os
import torch
import torchaudio
from wakeword.model import WakeWordDetector
TARGET_SAMPLE_RATE = 8000  # must match the rate the WakeWordDataSet trained at

# FIX: map_location lets a checkpoint saved on a CUDA machine load on a
# CPU-only one; .to(device)/.eval() afterwards is unchanged.
checkpoint = torch.load('./model0.pt', map_location='cpu')
model = WakeWordDetector(40, 64, num_layers=4)
model.load_state_dict(checkpoint)
model.eval()

tr = torchaudio.transforms.MFCC(TARGET_SAMPLE_RATE)
with torch.no_grad():
    for file in os.listdir('./sounds/train/1'):
        w, sr = torchaudio.load('./sounds/train/1/' + file)
        # FIX: the training Dataset resamples every clip to 8000 Hz before
        # the MFCC transform, but this script skipped that step, so the
        # model received features computed at the wrong sample rate --
        # the likely cause of the garbage outputs after reloading.
        if sr != TARGET_SAMPLE_RATE:
            w = torchaudio.transforms.Resample(sr, TARGET_SAMPLE_RATE)(w)
        w = tr(w)
        # (channel, n_mfcc, time) -> (channel=batch, time, n_mfcc),
        # matching the (batch, seq_len, features) layout the model expects.
        w = w.transpose(1, -1)
        print(torch.sigmoid(model(w)))
it seems strange and doesn't really work. If I use an LSTM, all outputs are about the same, and if I use a GRU, the outputs are slightly different. I checked the input/output shapes and they seem OK, so I don't know what the problem is :(
UPDATE: I'm using PyTorch 1.6.0, Python 3.6.11, Ubuntu 20.04.