Model.eval() gives incorrect loss for model with batchnorm layers

The testing code, which is called every 100 training iterations, is as follows:

def test_model(model):
    """Evaluate `model` on the test set and return (avg L1 loss, avg PSNR).

    Relies on module-level names: `dataloader_test`, `criterion` (L1 loss),
    `criterion_mse`, `dtype`, `Variable`, and `np`.

    Fixes over the original:
      * Runs under `torch.no_grad()` so no autograd graph is built during
        evaluation (saves memory/time; requires PyTorch >= 0.4).
      * Restores the model's previous train/eval mode on exit, so a caller
        mid-training does not silently keep running in eval mode afterwards
        (in eval mode batchnorm running statistics stop updating).
      * Raises a clear error for an empty test dataloader instead of hitting
        a NameError on an unbound loop variable.

    Raises:
        ValueError: if `dataloader_test` yields no batches.
    """
    import torch  # local import: top-of-file imports are outside this snippet

    psnr_test_avg = 0.0
    loss_test_avg = 0.0
    num_batches = 0

    was_training = model.training  # remember caller's mode to restore later
    model.eval()  # batchnorm uses running stats, dropout disabled

    try:
        with torch.no_grad():  # evaluation needs no gradients
            for test_data in dataloader_test:
                label_test = test_data['label_patch']
                residue_test = test_data['residue_patch']
                stacked_test = test_data['stacked_patch']
                microshift_test = test_data['train_patch']

                # Variable is a no-op wrapper on PyTorch >= 0.4; kept for
                # compatibility with the rest of this codebase.
                inputs_test = Variable(stacked_test.type(dtype), requires_grad=False)
                residues_test = Variable(residue_test.type(dtype), requires_grad=False)
                labels_test = Variable(label_test.type(dtype), requires_grad=False)
                microshifts_test = Variable(microshift_test.type(dtype), requires_grad=False)

                outputs_test = model(inputs_test)

                # MSE of the reconstructed image (residue prediction + microshifted input)
                loss_mse_test = criterion_mse(outputs_test + microshifts_test, labels_test).data.cpu().numpy()
                # L1 loss directly on the predicted residue
                loss_l1_test = criterion(outputs_test, residues_test).data.cpu().numpy()

                # PSNR assumes 8-bit pixel range [0, 255] — TODO confirm data scale
                psnr_test = 10 * np.log10(255 * 255 / loss_mse_test)

                loss_test_avg += loss_l1_test
                psnr_test_avg += psnr_test
                num_batches += 1
    finally:
        # Restore the mode the caller had set, even if evaluation failed.
        if was_training:
            model.train()

    if num_batches == 0:
        raise ValueError("dataloader_test yielded no batches; cannot compute averages")

    loss_test_avg /= num_batches
    psnr_test_avg /= num_batches

    return loss_test_avg, psnr_test_avg