My NN returns different labels on model.eval every time I run it

I am trying to train a model to recognize attributes such as the weather in images. It seems to be quite accurate on the train and validation sets of the same dataset, but it performs poorly on a different dataset and returns different labels each time I run it.
For the network I used a pretrained ResNet. I cannot understand what went wrong, so I would appreciate any help or suggestions.

# Pretrained ResNet-50 whose final layer is replaced by a new 40-output
# head followed by a sigmoid.
model_conv = torchvision.models.resnet50(pretrained=True)
model_conv.fc = nn.Sequential(nn.Linear(2048, 40), nn.Sigmoid())
model_conv = model_conv.to(device)

# Loss function: binary cross-entropy applied to the sigmoid outputs.
criterion = nn.BCELoss()

# Build the parameter groups by module rather than by position.
# list(model_conv.parameters())[-1] is only the head's bias tensor, so
# slicing with [:-1] / [-1] would leave the head's weight matrix in the
# low-learning-rate group together with the pretrained backbone.
head_params = list(model_conv.fc.parameters())
head_param_ids = {id(p) for p in head_params}
backbone_params = [
    p for p in model_conv.parameters() if id(p) not in head_param_ids
]

optimizer_ft = optim.Adam(
    [
        # Pretrained backbone: uses the default lr given below (1e-4).
        {'params': backbone_params},
        # Freshly initialised head (weight and bias): larger lr.
        {'params': head_params, 'lr': 1e-3},
    ],
    lr=1e-4,
    weight_decay=0,
)

# Multiply every group's learning rate by 0.7 after every 2 scheduler steps.
exp_lr_scheduler = lr_scheduler.StepLR(optimizer_ft, step_size=2, gamma=0.7)

and the training function:


# Train/validate `model` over the 'train' and 'val' entries of `dataloaders`,
# accumulating a running loss and a running F1 score per phase and printing
# them together with an average-precision figure.
#
# NOTE(review): this paste is incomplete. The signature below is cut off (no
# closing parenthesis or colon is visible), and the statement that encloses
# the phase loop -- presumably a per-epoch loop, TODO confirm -- is missing.
# The names `since`, `best_model_wts`, `val_acc_history`,
# `average_precis_train`, `gr_truth_array`, `preds_array`, `gr_truth_val`,
# `preds_val`, `show_scores`, `attributes` and `device` are used below but
# never assigned in the visible code.
def train_model(model, dataloaders, criterion, optimizer, scheduler, batch_size=5, 

        # Each epoch has a training and validation phase
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()  # Set model to training mode
            else:
                model.eval()   # Set model to evaluate mode

            # Per-phase accumulators, reset at the start of each phase.
            running_loss = 0.0
            running_corrects = 0  # accumulates f1_score values, not a count of correct predictions
            average_precis = 0.001
            loss_values = []  # NOTE(review): never read in the visible code


            print('Iterating over data:')
            for batch_idx, (inputs, labels) in enumerate(dataloaders[phase]):
                inputs = inputs.to(device)
                labels = labels.to(device).float()
                # Host-side NumPy copy of the labels; used for the loss target
                # and for the F1 computation below.
                gt_data = labels
                gt_data = gt_data.to(device)
                gt_data = gt_data.cpu().data.numpy()
                average_precision_array = []  # NOTE(review): never read in the visible code
                # zero the parameter gradients
                optimizer.zero_grad()
                # forward
                # track history if only in train
                if phase == 'train':
                  # The enclosing `if` already guarantees phase == 'train', so
                  # gradient tracking is always enabled in this branch.
                  with torch.set_grad_enabled(phase == 'train'):
                      outputs = model(inputs)
                      # Outputs are moved to the CPU, so the loss below is
                      # computed on CPU tensors.
                      outputs = outputs.cpu()#.data.numpy()
                      preds = outputs.cpu().data.numpy()
                      preds = np.round(preds) #set a condition for binary
                      preds_int = preds.astype(int)  # NOTE(review): unused below
                      gt_data_np = np.round(gt_data)
                      gt_data_int = gt_data_np.astype(int)  # NOTE(review): unused below
                      gt_data = torch.from_numpy(gt_data_np)
                      loss = criterion(outputs, gt_data)

                    # backward + optimize only if in training phase
                      # (redundant check: this branch only runs when phase == 'train')
                      if phase == 'train':
                          loss.backward()
                          optimizer.step()
                # statistics
                  # Loss is weighted by the batch size (inputs.size(0)).
                  running_loss += loss.item() * inputs.size(0)
                  # NOTE(review): one score is added per batch here, but
                  # epoch_acc below divides by len(dataloaders[phase].dataset)
                  # -- presumably the number of samples, not the number of
                  # batches; confirm the intended normalisation.
                  running_corrects += f1_score(gt_data, preds, average="samples")

                #Validation set
                else:
                  # Forward pass without gradient tracking.
                  with torch.no_grad():
                      val_outputs = model(inputs)
                      val_outputs = val_outputs.cpu()#.data.numpy()
                      val_preds = val_outputs.cpu().data.numpy()
                      val_preds = np.round(val_preds) #set a condition for binary
                      val_preds = val_preds.astype(int)
                      val_gtdata_np = np.round(gt_data)
                      val_gtdata_int = val_gtdata_np.astype(int)  # NOTE(review): unused below
                      val_gtdata = torch.from_numpy(val_gtdata_np)
                      loss = criterion(val_outputs, val_gtdata)

                  # statistics
                  running_loss += loss.item() * inputs.size(0)
                  # Same per-batch accumulation as in the training branch.
                  running_corrects += f1_score(val_gtdata, val_preds, average='samples')
                       
                
            if phase == 'train':
                scheduler.step()
                # NOTE(review): `average_precis_train`, `gr_truth_array` and
                # `preds_array` are not assigned anywhere in the visible code.
                average_precis_train += average_precision_score(gr_truth_array, preds_array, average= "macro")
                print("Average precision Training:", average_precis_train)      


            # Per-phase averages: accumulated totals divided by the dataset length.
            epoch_loss = running_loss / len(dataloaders[phase].dataset)
            epoch_acc = running_corrects / len(dataloaders[phase].dataset)
            epoch_acc = np.round(epoch_acc, decimals=4)
            
            if phase == 'val':
              # NOTE(review): scheduler.step() is also called in the 'train'
              # branch above, so the scheduler advances twice per pass over
              # both phases.
              scheduler.step()
              # These three lines recompute the same values as just above.
              epoch_loss = running_loss / len(dataloaders[phase].dataset)
              epoch_acc = running_corrects / len(dataloaders[phase].dataset) 
              epoch_acc = np.round(epoch_acc, decimals=4)
              # NOTE(review): `gr_truth_val` and `preds_val` are not assigned
              # anywhere in the visible code.
              average_precis += average_precision_score(gr_truth_val, preds_val, average="macro")
              print("Average precision Validation:", average_precis)

            print('{} Loss: {:.4f}'.format(phase, epoch_loss))
            print("Acc:", epoch_acc)

            #Visualize a few images
            # `inputs` and `val_outputs` here are whatever the last validation
            # batch left behind.
            if phase == 'val':
              for jp in range(inputs.size()[0]):
                  ax = plt.subplot(1, 4, jp + 1)
                  # NOTE(review): the figure is created after `ax`, so `ax`
                  # presumably belongs to a previously current figure -- confirm.
                  plt.figure(figsize=[5, 4])
                  ax.axis('off')
                  ax.set_title('Sample #{}'.format(jp))
                  # Reorder axes (0, 1, 2) -> (1, 2, 0); presumably CHW -> HWC
                  # for plotting, TODO confirm.
                  img = inputs.cpu().data[jp].numpy().transpose((1, 2, 0))
                  # Only the first row of val_outputs is shown (loop breaks
                  # immediately).
                  for value in val_outputs:
                    show_scores(img, value, attributes)
                    break      

                  plt.show()
                  # Leaves the sample loop after the first iteration.
                  break 
        
    print()  
    
    # NOTE(review): `since` is not assigned in the visible code.
    time_elapsed = time.time() - since
    # NOTE(review): the print call below is missing its closing parenthesis.
    print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60)

    # load best model weights
    # NOTE(review): `best_model_wts` and `val_acc_history` are never assigned
    # in the visible code.
    model.load_state_dict(best_model_wts)
    return model, val_acc_history

Do you mean that the model output is different for the same input? That is unexpected for a ResNet with model.eval().

e.g., what do you get when you run something like

# Sanity check: in eval mode, two forward passes over the same random input
# should give matching outputs.
model.eval()
inp = torch.randn(1, 3, 224, 224, device='cuda')
out1, out2 = model(inp), model(inp)
print(torch.allclose(out1, out2))

Sorry, maybe I was not clear. I have a dataset (let’s call it A) split into train and validation sets. When I train and validate on these two splits of A (see the training function above), everything works correctly, but when I test the model on a different image dataset it returns different results every time. For example, for one image I might get labels like clouds, storm, etc., and if I run the model again I get different labels for the same image.