Hope this post is not too long. I tried to make it as concise as possible.
I have trained a model with a pretty good result: 87% accuracy on validation set
def validate(model, device, dataloader, classes, criterion):
dataset_size = len(dataloader["val"].dataset)
batch_losses_val = 0
batch_corrects_val = 0
predictions_val = []
val_labels_list = []
example_images = []
model.eval()
vbar = tqdm(dataloader["val"], desc="validation iterations")
for batch_idx, data_val in enumerate(vbar):
val_images = data_val["image"]
val_labels = data_val["label"]
val_images = val_images.to(device)
val_labels = val_labels.to(device)
### Evaluation model and compute loss
with torch.no_grad():
outputs = model(val_images)
preds = outputs.data.max(1)[1].squeeze()
losses = criterion(outputs, val_labels)
batch_losses_val += losses.item()
batch_corrects_val += torch.sum(preds == val_labels).cpu()
vbar.set_description(
f"Average val loss: {(batch_losses_val / (batch_idx + 1)):.3f}"
)
predictions_val += preds.cpu().numpy().tolist()
val_labels_list += val_labels.cpu().numpy().tolist()
example_images.append(
wandb.Image(
val_images[0],
caption=f"Pred: {classes[preds[0].item()]} Truth: {classes[val_labels[0].item()]}",
)
)
epoch_loss_val = batch_losses_val / dataset_size
epoch_acc_val = batch_corrects_val / dataset_size
F1_score = f1_score(predictions_val, val_labels_list, average='weighted')
tqdm.write(
f"Validation >>> Loss : {epoch_loss_val:.5f}, Acc : {epoch_acc_val:.4%}\n"
)
wandb.log(
{
"Validation Accuracy": epoch_acc_val * 100.0,
"Validation Loss": epoch_loss_val,
"Validation exmaples": example_images,
"F1 validation score": F1_score,
}
)
return epoch_loss_val, epoch_acc_val
I saved the best model like this:
# save best models
if acc_val >= best_acc_val:
best_epoch = epoch
best_acc_train = acc_train
best_acc_val = acc_val
best_model_path = f"/content/drive/MyDrive/P6_data/models/{model.__class__.__name__}_acc={acc_val}_lr={config.lr}_wd={config.weight_decay}_do={config.dropout}.pth"
best_model = copy.deepcopy(model)
torch.save(best_model.state_dict(), best_model_path)
However, when I used the trained model to make prediction on the very same validation set, the accuracy dropped to around 1x%, like the model has never been trained. I make prediction like this:
def predict(
data: pd.Series,
img_dir: str,
transform: Compose,
batch_size: int,
num_workers: int,
model: str,
trained_model_path: str,
softmax: bool = False,
) -> list:
preds = []
dataset = BuildDataset(data, img_dir, transform=transform)
dataloader = DataLoader(
dataset, batch_size=batch_size, shuffle=True, num_workers=num_workers
)
# Get GPU, or else CPU, device for training.
device = "cuda:0" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")
# load saved model
if model == "VGG16":
model = VGG16(device, trained_model_path, softmax=softmax)
print(model.state_dict())
model.to(device)
model.eval()
# load images in batches
pbar = tqdm(dataloader, desc="prediction iterations")
for data in pbar:
images = data["image"]
images = images.to(device)
# extract image features
with torch.no_grad():
outputs = model(images)
if softmax:
preds.append(outputs.data.max(1)[1].detach().numpy())
else:
preds.append(outputs.detach().numpy())
return preds
But when I print and check the state_dict loaded, it is really the model that has been trained. I made a quick check:
The default pertained model:
OrderedDict([('features.0.weight', tensor([[[[-5.5373e-01, 1.4270e-01, 5.2896e-01],
[-5.8312e-01, 3.5655e-01, 7.6566e-01],
[-6.9022e-01, -4.8019e-02, 4.8409e-01]],
[[ 1.7548e-01, 9.8630e-03, -8.1413e-02],
[ 4.4089e-02, -7.0323e-02, -2.6035e-01],
[ 1.3239e-01, -1.7279e-01, -1.3226e-01]],
[[ 3.1303e-01, -1.6591e-01, -4.2752e-01],
[ 4.7519e-01, -8.2677e-02, -4.8700e-01],
[ 6.3203e-01, 1.9308e-02, -2.7753e-01]]],
....
....
The trained model below. This one with only 50% accuracy. But still, we can see that it’s different from the original default weights. And the prediction accuracy on the same validation set is always 1x% compared to much higher result during training.
OrderedDict([('features.0.weight', tensor([[[[-0.5606, 0.1344, 0.5200],
[-0.5897, 0.3481, 0.7565],
[-0.6963, -0.0557, 0.4754]],
[[ 0.1677, 0.0022, -0.0890],
[ 0.0365, -0.0780, -0.2679],
[ 0.1250, -0.1802, -0.1400]],
[[ 0.3036, -0.1747, -0.4359],
[ 0.4654, -0.0918, -0.4955],
[ 0.6222, 0.0101, -0.2866]]],
...
...
What other verifications I can make to debug please?