Can't reproduce output of RNN

I want to build wake word detection model using RNN.
I built Dataset for it.

from pathlib import Path

import pandas as pd
import torch
import torchaudio

class WakeWordDataSet(
    def __init__(self, csv_file: Path, sample_rate: int):
        self._data = pd.read_csv(csv_file, header=None)
        self._sample_rate = sample_rate
        self._mfcc = torchaudio.transforms.MFCC(sample_rate)

    def __len__(self):
        return len(self._data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        data = self._data.iloc[idx]
        waveform, sample_rate = torchaudio.load(data[0])
        label = data[1]
        if sample_rate > self._sample_rate:
            waveform = torchaudio.transforms.Resample(sample_rate, self._sample_rate)(waveform)
        mfcc = self._mfcc(waveform)
        return mfcc, label

It’s simply computes mfcc.
Here’s my model

import torch

import torch.nn as nn

class WakeWordDetector(nn.Module):
    def __init__(self, seq_length, hidden_size, num_layers):
        super(WakeWordDetector, self).__init__()
        self._gru = nn.GRU(seq_length, hidden_size, num_layers=num_layers, dropout=0.25)
        self._classifier = nn.Linear(num_layers * hidden_size, out_features=1)

    def forward(self, x: torch.Tensor):
        # x => batch_size, seq_len, features
        x = x.transpose(0, 1)
        _, x = self._gru(x)
        x = x.transpose(0, 1)
        x = x.flatten(1)
        # print(x.shape)
        x = self._classifier(x)
        return x.squeeze_(-1)

And here’s my training scipt

from pathlib import Path
from sklearn.metrics import f1_score
import torch
import torch.optim as optim
from import DataLoader

from wakeword.dataset import WakeWordDataSet
from wakeword.model import WakeWordDetector
from wakeword.utils import collate_fn

train_data = WakeWordDataSet(Path('./wakeword/train.csv'), 8000)
test_data = WakeWordDataSet(Path('./wakeword/test.csv'), 8000)
model = WakeWordDetector(40, 64, num_layers=4)

train_loader = DataLoader(dataset=train_data, batch_size=32, collate_fn=collate_fn, shuffle=True)
test_loader = DataLoader(dataset=test_data, batch_size=32, collate_fn=collate_fn)

epochs = 100

device = 'cuda' if torch.cuda.is_available() else 'cpu'

model =

opt = optim.AdamW(model.parameters(), lr=0.0001)

criterion = torch.nn.BCELoss()
for epoch in range(epochs):
    running_loss = 0.0
    running_f1 = 0.0
    for i, data in enumerate(train_loader):
        mfccs = data[0].to(device)
        labels = data[1].to(device)

        outputs = model(mfccs)
        outputs = torch.sigmoid(outputs)
        loss = criterion(outputs, labels)
        rounded = torch.round(outputs)
        running_f1 += f1_score(labels.cpu().numpy(), rounded.cpu().detach().numpy())
        running_loss += loss.item()
        if i % 20 == 19:
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 20))
            print('[%d, %5d] f1: %.3f' %
                  (epoch + 1, i + 1, running_f1 / 20))
            running_loss = 0.0
            running_f1 = 0.0

    test_f1 = 0.0
    test_loss = 0.0
    with torch.no_grad():
        for i, test_data in enumerate(test_loader):
            mfccs = test_data[0].to(device)
            labels = test_data[1].to(device)

            outputs = model(mfccs)
            outputs = torch.sigmoid(outputs)
            rounded = torch.round(outputs)
            current_f1 = f1_score(labels.cpu().numpy(), rounded.cpu().detach().numpy())
            test_f1 += current_f1
            loss = criterion(outputs, labels)
            test_loss += loss.item()

        print(f'test loss: {test_loss / len(test_loader)}')
        print(f'test f1: {test_f1 / len(test_loader)}')
        if test_f1 / len(test_loader) == 1.0:
  , f'model{epoch}.pt')

I have imbalanced dataset so i’m using f1 score.
So this model converges pretty fast and i can see it mysewlf using debugger. Outputs are really good. It can archive 1.0 f1 score but when i want to reload this model in some scrpit for example:

import os

import torch

import torchaudio

from wakeword.model import WakeWordDetector

checkpoint = torch.load('./')
model = WakeWordDetector(40, 64, num_layers=4)
tr = torchaudio.transforms.MFCC(8000)
with torch.no_grad():
    for file in os.listdir('./sounds/train/1'):
        w, _ = torchaudio.load('./sounds/train/1/' + file)
        w = tr(w)
        w = w.transpose(1, -1)

it seems strange and doesn’t really work. If I use LSTM all outputs are about the same and if i use GRU outputs are slightly different. I checked inputs/outputs shapes they seem ok so I don’t know what’s the problem is:(
UPD: I’m using pytorch 1.6.0, python 3.6.11, ubuntu 20

I feel like I almost solved this by myself.
It seems like I must specify ‘map_location’ parameter.
Like this checkpoint = torch.load('./models/', map_location='cpu')

UPD: Well this didn’t really work after I changed model.
So still help required!

This is output i’m getting if i run model on folder with files that were used for test

Did you split the initial dataset into a training, validation, and test set?
If so, how was the validation and test score in comparison to the training score?

If you did not apply the splitting, your model might just overfit to the training dataset and thus perform badly for unseen data.