Saved model gives 1%-3% lower test accuracy on the same test set

Hi,
I have saved the model that produced the best accuracy. Now, when I load that saved model and run it on the same test set, the test accuracy is inconsistent. I have run the test multiple times, and every time it produces a different accuracy (1%-3% lower than the original), whereas a consistent score is expected. For example, I saved the model when it produced 85.75% test accuracy; it should always produce the same accuracy on the same test set, shouldn't it?
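For context, this is roughly the load-and-test cycle I mean (a minimal sketch; the file name and the validate helper are placeholders, not my exact code):

import torch

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
net = GetModel().to(device)  # model factory shown below
net.load_state_dict(torch.load('best_model.pth', map_location=device))  # placeholder path
net.eval()
with torch.no_grad():
    val_acc, val_loss = validate(net)  # hypothetical helper running the fixed test set
print(val_acc)  # expected 85.75% every run, but it comes out 1%-3% lower and varies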

This is my model:

import torch.nn as nn
import torch.nn.functional as F

class Env2Acl(nn.Module):
    def __init__(self, input_length, n_class, sr):
        super(Env2Acl, self).__init__()
        self.input_length = input_length

        stride1 = 2
        stride2 = 2
        channels = 8
        k_size = (3, 3)
        n_frames = (sr/1000)*10  # number of frames per 10 ms

        self.filter_bank_pool_size = int(n_frames/(stride1*stride2))
        self.pool_size = (2, 2)

        self.conv1, self.bn1 = ConvLayer(1, channels, (1, 9), (1, stride1)).get()
        self.conv2, self.bn2 = ConvLayer(channels, channels*8, (1, 5), (1, stride2)).get()
        self.conv3, self.bn3 = ConvLayer(1, channels*4, k_size, padding=1).get()
        self.conv4, self.bn4 = ConvLayer(channels*4, channels*8, k_size, padding=1).get()
        self.conv5, self.bn5 = ConvLayer(channels*8, channels*8, k_size, padding=1).get()
        self.conv6, self.bn6 = ConvLayer(channels*8, channels*16, k_size, padding=1).get()
        self.conv7, self.bn7 = ConvLayer(channels*16, channels*16, k_size, padding=1).get()
        self.conv8, self.bn8 = ConvLayer(channels*16, channels*32, k_size, padding=1).get()
        self.conv9, self.bn9 = ConvLayer(channels*32, channels*32, k_size, padding=1).get()
        self.conv10, self.bn10 = ConvLayer(channels*32, channels*64, k_size, padding=1).get()
        self.conv11, self.bn11 = ConvLayer(channels*64, channels*64, k_size, padding=1).get()
        self.conv12, self.bn12 = ConvLayer(channels*64, n_class, (1, 1)).get()

        self.maxpool1 = nn.MaxPool2d(kernel_size=(1, self.filter_bank_pool_size))
        self.maxpool2 = nn.MaxPool2d(kernel_size=(2, 2))
        self.avgpool = nn.AvgPool2d(kernel_size=(2, 4))
        self.fcn = nn.Linear(n_class, n_class)
        nn.init.kaiming_normal_(self.fcn.weight, nonlinearity='sigmoid')  # kaiming with sigmoid is equivalent to lecun_normal in keras

    def forward(self, x):
        # Start: filter bank
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.maxpool1(x)
        # End: filter bank

        # swap axes
        x = x.permute((0, 2, 1, 3))

        x = self.maxpool2(F.relu(self.bn3(self.conv3(x))))

        x = F.relu(self.bn4(self.conv4(x)))
        x = self.maxpool2(F.relu(self.bn5(self.conv5(x))))

        x = F.relu(self.bn6(self.conv6(x)))
        x = self.maxpool2(F.relu(self.bn7(self.conv7(x))))

        x = F.relu(self.bn8(self.conv8(x)))
        x = self.maxpool2(F.relu(self.bn9(self.conv9(x))))

        x = F.relu(self.bn10(self.conv10(x)))
        x = self.maxpool2(F.relu(self.bn11(self.conv11(x))))

        x = nn.Dropout(0.2)(x)  # note: constructed inside forward, so not registered as a submodule
        x = self.avgpool(F.relu(self.bn12(self.conv12(x))))

        x = nn.Flatten()(x)

        y = F.softmax(self.fcn(x), dim=1)
        return y

class ConvLayer:
    def __init__(self, in_channels, out_channels, kernel_size, stride=(1, 1), padding=0, bias=False):
        self.conv = nn.Conv2d(in_channels=in_channels, out_channels=out_channels, kernel_size=kernel_size, stride=stride, padding=padding, bias=bias)
        nn.init.kaiming_normal_(self.conv.weight, nonlinearity='relu')  # kaiming with relu is equivalent to he_normal in keras
        self.bn = nn.BatchNorm2d(out_channels)

    def get(self):
        return self.conv, self.bn

def GetModel():
    net = Env2Acl(66650, 50, 44100)
    return net
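For reference, instantiating the model and pushing a dummy batch through it looks like this (a sketch; the (batch, 1, 1, input_length) layout is my assumption about how the raw audio is framed, given conv1 has in_channels=1 and a (1, 9) kernel):

import torch

net = GetModel()
x = torch.randn(4, 1, 1, 66650)  # dummy batch: (batch, 1, 1, input_length)
with torch.no_grad():
    y = net(x)
print(y.shape)  # torch.Size([4, 50]) -- softmax scores over the 50 classes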

def Train(self):
    train_start_time = time.time()
    print(self.device)
    net = model.GetModel().to(self.device)
    dir = os.getcwd()
    lossFunc = torch.nn.KLDivLoss(reduction='batchmean')
    model_path = dir + '/comp_nets/env2acl_trained_no_attack.pth'
    if os.path.isfile(model_path):
        net.load_state_dict(torch.load(model_path, map_location=self.device))
        print('Model Loaded')
        net.eval()
        val_acc, val_loss = self.__validate(net, lossFunc)
        print('Testing - Val: Loss {:.3f}  Acc(top1) {:.3f}%'.format(val_loss, val_acc))
        net.train()
    optimizer = optim.SGD(net.parameters(), lr=self.opt.LR, weight_decay=self.opt.weightDecay, momentum=self.opt.momentum, nesterov=True)

    for epochIdx in range(self.opt.nEpochs):
        epoch_start_time = time.time()
        optimizer.param_groups[0]['lr'] = self.__get_lr(epochIdx+1)
        cur_lr = optimizer.param_groups[0]['lr']
        self.load_data(epochIdx+1)
        running_loss = 0.0
        running_acc = 0.0
        n_batches = math.ceil(len(self.trainX)/self.opt.batchSize)
        for batchIdx in range(n_batches):
            x, y = self.__get_batch(batchIdx)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(x)
            running_acc += (((outputs.data.argmax(dim=1) == y.argmax(dim=1))*1).float().mean()).item()
            loss = lossFunc(outputs.log(), y)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()

        tr_acc = (running_acc / n_batches)*100
        tr_loss = running_loss / n_batches

        # Epoch-wise validation
        epoch_train_time = time.time() - epoch_start_time
        net.eval()
        val_acc, val_loss = self.__validate(net, lossFunc)

        # Save best model
        self.__save_model(val_acc, epochIdx, net)

        self.__on_epoch_end(epoch_start_time, epoch_train_time, epochIdx, cur_lr, tr_loss, tr_acc, val_loss, val_acc)

        running_loss = 0
        running_acc = 0
        net.train()

    total_time_taken = time.time() - train_start_time
    print("Execution finished in: {}".format(U.to_hms(total_time_taken)))

def __validate(self, net, lossFunc):
    with torch.no_grad():
        y_pred = None
        batch_size = (self.opt.batchSize//self.opt.nCrops)*self.opt.nCrops
        for idx in range(math.ceil(len(self.testX)/batch_size)):
            x = self.testX[idx*batch_size : (idx+1)*batch_size]
            scores = net(x)
            y_pred = scores.data if y_pred is None else torch.cat((y_pred, scores.data))

        acc, loss = self.__compute_accuracy(y_pred, self.testY, lossFunc)
    return acc, loss

I am not entirely sure what is happening. Is it because I am saving only the state_dict rather than the entire model, so that some components are randomly initialised every time?
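To be concrete, the two saving approaches I know of look like this (a minimal sketch, not my exact code):

# Approach 1: save only the parameters (what I am doing)
torch.save(net.state_dict(), 'best.pth')
net = GetModel()
net.load_state_dict(torch.load('best.pth'))  # overwrites the random init, including BatchNorm running stats

# Approach 2: pickle the whole module
torch.save(net, 'best_full.pth')
net = torch.load('best_full.pth')

As far as I understand, load_state_dict overwrites every registered parameter and buffer, so the random initialisation should not survive loading either way.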

Any thoughts would be much appreciated.

I think it could possibly be due to a different batch size or a different distribution of batch data each time. I am not sure, but maybe this can be a reason.

@chetan06 The batch size and the batch data distribution are exactly the same every time.
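To double-check that, I printed a quick fingerprint of the test tensors before each run to confirm the input is identical (a sketch, using the attribute names from my code above):

print(self.testX.shape, self.testX.sum().item())  # same shape and sum every run
print(self.testY.shape, self.testY.sum().item())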

Hi, I am now using a fixed seed:

seed = 3
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

Now I am seeing exactly the same test accuracy every time, but it is 1% lower than the original. The saved model produced 85.75% test accuracy; after reloading, it produces 84.75% every time I run the test set.
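A consistent-but-lower score suggests something stochastic is still active at test time and the fixed seed has merely frozen it. One candidate in the model above is the nn.Dropout(0.2)(x) line: a Dropout constructed inside forward is not registered as a submodule, so net.eval() never switches it out of training mode, and it keeps dropping activations during testing. A quick check (a sketch; without a fixed seed, two eval-mode passes on the same input should be bit-identical):

import torch

net = GetModel()
net.eval()
x = torch.randn(1, 1, 1, 66650)  # dummy input matching the expected shape
with torch.no_grad():
    out1 = net(x)
    out2 = net(x)
print(torch.equal(out1, out2))  # False here: the inline Dropout still fires in eval mode

If that is the cause, registering the layer in __init__ (self.dropout = nn.Dropout(0.2)) and calling self.dropout(x) in forward should let net.eval() disable it.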