Prediction accuracy of a trained model is much worse in testing than validation, both on the same validation set

Hope this post is not too long. I tried to make it as concise as possible.
I have trained a model with a pretty good result: 87% accuracy on validation set

def validate(model, device, dataloader, classes, criterion):
    dataset_size = len(dataloader["val"].dataset)
    batch_losses_val = 0
    batch_corrects_val = 0
    predictions_val = []
    val_labels_list = []
    example_images = []

    model.eval()
    vbar = tqdm(dataloader["val"], desc="validation iterations")
    for batch_idx, data_val in enumerate(vbar):
        val_images = data_val["image"]
        val_labels = data_val["label"]
        val_images = val_images.to(device)
        val_labels = val_labels.to(device)

        ### Evaluation model and compute loss
        with torch.no_grad():
            outputs = model(val_images)
        preds = outputs.data.max(1)[1].squeeze()
        losses = criterion(outputs, val_labels)

        batch_losses_val += losses.item()
        batch_corrects_val += torch.sum(preds == val_labels).cpu()
        vbar.set_description(
            f"Average val loss: {(batch_losses_val / (batch_idx + 1)):.3f}"
        )

        predictions_val += preds.cpu().numpy().tolist()
        val_labels_list += val_labels.cpu().numpy().tolist()

        example_images.append(
            wandb.Image(
                val_images[0],
                caption=f"Pred: {classes[preds[0].item()]} Truth: {classes[val_labels[0].item()]}",
            )
        )

    epoch_loss_val = batch_losses_val / dataset_size
    epoch_acc_val = batch_corrects_val / dataset_size
    F1_score = f1_score(predictions_val, val_labels_list, average='weighted')
    tqdm.write(
        f"Validation >>> Loss : {epoch_loss_val:.5f}, Acc : {epoch_acc_val:.4%}\n"
    )

    wandb.log(
        {
            "Validation Accuracy": epoch_acc_val * 100.0,
            "Validation Loss": epoch_loss_val,
            "Validation exmaples": example_images,
            "F1 validation score": F1_score,
        }
    )

    return epoch_loss_val, epoch_acc_val

I saved the best model like this:

# save best models
        if acc_val >= best_acc_val:
            best_epoch = epoch
            best_acc_train = acc_train
            best_acc_val = acc_val
            best_model_path = f"/content/drive/MyDrive/P6_data/models/{model.__class__.__name__}_acc={acc_val}_lr={config.lr}_wd={config.weight_decay}_do={config.dropout}.pth"
            best_model = copy.deepcopy(model)
            torch.save(best_model.state_dict(), best_model_path)

However, when I used the trained model to make prediction on the very same validation set, the accuracy dropped to around 1x%, like the model has never been trained. I make prediction like this:

def predict(
    data: pd.Series,
    img_dir: str,
    transform: Compose,
    batch_size: int,
    num_workers: int,
    model: str,
    trained_model_path: str,
    softmax: bool = False,
) -> list:
    preds = []

    dataset = BuildDataset(data, img_dir, transform=transform)
    dataloader = DataLoader(
        dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers
    )

    # Get GPU, or else CPU, device for training.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    print(f"Using {device} device")

    # load saved model
    if model == "VGG16":
        model = VGG16(device, trained_model_path, softmax=softmax)
    print(model.state_dict())
    model.to(device)
    model.eval()

    # load images in batches
    pbar = tqdm(dataloader, desc="prediction iterations")
    for data in pbar:
        images = data["image"]
        images = images.to(device)

        # extract image features
        with torch.no_grad():
            outputs = model(images)
        if softmax:
            preds.append(outputs.data.max(1)[1].detach().numpy())
        else:
            preds.append(outputs.detach().numpy())

    return preds

But when I print and check the state_dict loaded, it is really the model that has been trained. I made a quick check:

The default pertained model:

OrderedDict([('features.0.weight', tensor([[[[-5.5373e-01,  1.4270e-01,  5.2896e-01],
          [-5.8312e-01,  3.5655e-01,  7.6566e-01],
          [-6.9022e-01, -4.8019e-02,  4.8409e-01]],

         [[ 1.7548e-01,  9.8630e-03, -8.1413e-02],
          [ 4.4089e-02, -7.0323e-02, -2.6035e-01],
          [ 1.3239e-01, -1.7279e-01, -1.3226e-01]],

         [[ 3.1303e-01, -1.6591e-01, -4.2752e-01],
          [ 4.7519e-01, -8.2677e-02, -4.8700e-01],
          [ 6.3203e-01,  1.9308e-02, -2.7753e-01]]],
....
....

The trained model below. This one with only 50% accuracy. But still, we can see that it’s different from the original default weights. And the prediction accuracy on the same validation set is always 1x% compared to much higher result during training.

OrderedDict([('features.0.weight', tensor([[[[-0.5606,  0.1344,  0.5200],
          [-0.5897,  0.3481,  0.7565],
          [-0.6963, -0.0557,  0.4754]],

         [[ 0.1677,  0.0022, -0.0890],
          [ 0.0365, -0.0780, -0.2679],
          [ 0.1250, -0.1802, -0.1400]],

         [[ 0.3036, -0.1747, -0.4359],
          [ 0.4654, -0.0918, -0.4955],
          [ 0.6222,  0.0101, -0.2866]]],
...
...

What other verifications I can make to debug please?

It seems one difference between your validation and test runs is the usage of model.eval().
If that’s the case, I would guess that e.g. the batchnorm running stats might be bad which could decrease the model performance. Depending on your use case, you could try to tune the momentum hyperparameter and check if this would improve the accuracy.

@ptrblck Thank you so much for replying!
I’m not sure what do you mean by “the usage of model.eval() is different”. For both validate and predict function (which is my testing), I follow the same order:

model.to(device)
model.eval()
for data  in dataloader:
    ...
    ...

with torch.no_grad():
   outputs =  model(images)

Should I do it differently for the predict function?

I can understand if the testing result is a bit lower than the validation result, but dropping from 87% to 1x%, it’s too dramatic, right? It shouldn’t be something can be fixed by simply tuning the momentum?

I saw your comments regarding the use of model.eval() in another post. But I’m not sure it’s my case, may be I don’t fully understand your another post. Sorry if I made you repeating stuff.