Test accuracy differs between training time and later evaluation

Hi Folks,
During training, my test accuracy after 2 epochs is 62%. I saved the best model using the following code.

Preformatted text`
def save_checkpoint(state, is_best, filename='checkpoint.pth.tar'):
    """Serialize a training checkpoint and optionally keep it as the best one.

    Args:
        state: Object handed to ``torch.save`` (the loading code in this
            post reads the keys ``'state_dict'`` and ``'best_acc1'`` from it).
        is_best: When true, the file just written is additionally copied to
            ``model_best.pth.tar`` in the current working directory.
        filename: Destination path of the checkpoint file.
    """
    torch.save(state, filename)
    if is_best:
        # Keep a separate copy so later checkpoints do not overwrite the best one.
        shutil.copyfile(filename, 'model_best.pth.tar')

Then I load the model using

type or paste code here
# Short excerpt of the loading step; `something` is presumably a placeholder
# for the real checkpoint path (a concrete path appears in the full script).
PATH=something
# model.load_state_dict(torch.load('model.pt'))
checkpoint = torch.load(PATH)

# NOTE(review): `model` has to be constructed before this call; the excerpt
# does not show that step.
model.load_state_dict(checkpoint['state_dict'])
# Accuracy value stored in the checkpoint under 'best_acc1'.
print(checkpoint['best_acc1'])
# Deterministic preprocessing used for every evaluation image.
test_transforms = transforms.Compose([transforms.Resize(32),
                                      transforms.ToTensor(),
                                      transforms.Normalize((.4914, .4822, .4465), (.2023, .1994, .2010))
                                      ])
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Rebuild the network with a 3-way classification head before loading weights.
model = models.resnext101_32x8d(pretrained=True)
num_features = model.fc.in_features
model.fc = nn.Linear(num_features, 3)
# Honour the CPU fallback chosen above instead of unconditionally calling .cuda().
model = model.to(device)

PATH='/home/fengmi/semi-supervised/model_best.pth.tar'
# map_location lets a checkpoint written on a GPU machine load on a CPU-only one.
checkpoint = torch.load(PATH, map_location=device)

model.load_state_dict(checkpoint['state_dict'])
print(checkpoint['best_acc1'])
# NOTE(review): if the stored keys carry a "module" prefix, rename them
# before load_state_dict rather than passing strict=False, which would
# silently skip the mismatching weights.

print(model)
print(checkpoint.keys())
# Switch layers such as batch norm and dropout to inference behaviour.
model.eval()

# Augmenting pipeline (padding, random flip, random crop). It is not referenced
# by the evaluation code shown here; random augmentation at test time would
# make results non-deterministic.
transform = transforms.Compose([
        transforms.Pad(4),
        transforms.RandomHorizontalFlip(),
        transforms.RandomCrop(32),
        transforms.ToTensor(),
        transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
    ])
def predict_image(image):
    """Run the module-level ``model`` on one already preprocessed image tensor.

    Args:
        image: Image tensor without a batch dimension (presumably the
            C x H x W output of ``test_transforms`` -- confirm with caller).

    Returns:
        Tuple ``(output, index)``: the raw model output for the one-image
        batch and the flat position of its largest value as a Python int.
    """
    # Out-of-place unsqueeze: the caller's tensor keeps its original shape.
    batch = image.unsqueeze(0).to(device)
    # Inference only, so skip autograd bookkeeping (Variable is no longer needed).
    with torch.no_grad():
        output = model(batch)
    index = int(output.argmax())
    return output, index
    
import torch
import torchvision.datasets as dset



def get_random_images(num):
    """Load one batch of up to ``num`` randomly chosen samples from ``data_dir``.

    The dataset is built with ``test_transforms``; a random subset of its
    indices is drawn and fetched as a single batch.

    Args:
        num: Maximum number of samples to return (also the batch size).

    Returns:
        Tuple ``(images, labels, paths)`` as produced by one batch of the
        ``ImageFolderWithPaths`` loader.
    """
    from torch.utils.data.sampler import SubsetRandomSampler

    data = ImageFolderWithPaths(root=data_dir, transform=test_transforms)
    print(len(data))

    indices = list(range(len(data)))
    np.random.shuffle(indices)
    idx = indices[:num]

    # Actually sample through the shuffled subset; without a sampler the loader
    # would always return the first `num` samples in folder order.
    loader = torch.utils.data.DataLoader(
        data, batch_size=num, sampler=SubsetRandomSampler(idx))
    # Built-in next() works on every PyTorch version; iterator.next() does not.
    images, labels, paths = next(iter(loader))
    return images, labels, paths
    

# Placeholder: replace with the 3 real class names, ordered by label index.
classes = ["class_0", "class_1", "class_2"]
to_pil = transforms.ToPILImage()
images, labels, paths = get_random_images(840)
labels = labels.to(device)
print(images.shape)
total = len(images)
# The counter must exist before the loop increments it.
correct = 0
fig = plt.figure(figsize=(10, 10))
for ii, img_tensor in enumerate(images):
    # NOTE(review): the tensors were normalized by test_transforms, so the
    # rendered picture is only a rough preview of the original image.
    image = to_pil(img_tensor)
    output, index = predict_image(img_tensor)
    res = int(labels[ii]) == index
    sub = fig.add_subplot(1, total, ii + 1)
    sub.set_title(str(classes[index]) + ":" + str(res))
    sub.axis('off')
    sub.imshow(image)
    if res:
        correct += 1
# Show the figure once, after all subplots exist, instead of blocking per image.
plt.show()
print(correct)
print(total)
print(100 * correct/total)

What am I doing wrong?

How large is the difference between these runs?
Did you use the same test_transform before storing the state_dict?
If so, I would recommend using exactly the same input tensor (after preprocessing) for both runs and comparing the outputs.

PS: `Variable` has been deprecated since PyTorch 0.4, so you can use plain tensors directly.

Hi @ptrblck, I used test_transform in the training code too. I have separate training code for storing the state_dict.