Trained triplet loss model integrated with XGBoost classifier is making wrong predictions

I wrote the following code to train my model:

model.train()
min_loss = 1.0  # lowest epoch loss seen so far
loss_all = []
for epoch in tqdm(range(epochs), desc="Epochs"):
    running_loss = []
    for step, (anchor_img, positive_img, negative_img, anchor_label) in enumerate(tqdm(train_loader, desc="Training", leave=False)):
        anchor_img = anchor_img.to(device)
        positive_img = positive_img.to(device)
        negative_img = negative_img.to(device)

        optimizer.zero_grad()
        anchor_out = model(anchor_img)
        positive_out = model(positive_img)
        negative_out = model(negative_img)

        loss = criterion(anchor_out, positive_out, negative_out)
        loss.backward()
        optimizer.step()

        running_loss.append(loss.cpu().detach().numpy())

    epoch_loss = np.mean(running_loss)
    print("Epoch: {}/{} - Loss: {:.4f}".format(epoch + 1, epochs, epoch_loss))
    loss_all.append(epoch_loss)
    if epoch_loss <= min_loss:
        min_loss = epoch_loss
        # checkpoint the best model so far
        torch.save({"model_state_dict": model.state_dict(),
                    "optimizer_state_dict": optimizer.state_dict()
                    }, PATH2 + "trained_model_random.pt")

        # also export a TorchScript version for C++ inference
        sm = torch.jit.script(model)
        sm.save(PATH2 + "trained_model_random_cpp.pt")

train_results = []
labels = []

model.eval()
with torch.no_grad():
    for img, _, _, label in tqdm(train_loader):
        train_results.append(model(img.to(device)).cpu().numpy())
        labels.append(label)

train_results = np.concatenate(train_results)
labels = np.concatenate(labels)
print(train_results.shape)

tree = XGBClassifier(seed=2020)
tree.fit(train_results, labels)

test_results = []
model.eval()
with torch.no_grad():
    for i, img in enumerate(tqdm(test_loader)):
        if i == 0:
            t1 = time.time() * 1000  # time only the first batch
        test_results.append(model(img.to(device)).cpu().numpy())
        if i == 0:
            t2 = time.time() * 1000

test_results = np.concatenate(test_results)

# visualize the first two embedding dimensions of the test set
plt.figure(figsize=(30, 25), facecolor="azure")
plt.scatter(test_results[:, 0], test_results[:, 1], c=test_label)

submit = pd.read_csv(PATH+"sample_submission.csv")
t1 = time.time()*1000
submit.Label = tree.predict(test_results)
t2 = time.time()*1000

tree.save_model(PATH2+"random_one_pred.json")

result = submit['Label'].tolist()
correct = 0
for i in range(len(result)):
    if test_label[i] == result[i]:
        correct += 1
accuracy = correct * 100 / len(result)

There are 100,000 images in total, split into a 75/25 ratio for training and test data. The accuracy score is around 80%.

Then I wrote a second script where I only load the .pt file and the .json file and test the model on the same images I used for training. But every time it predicts the same class label. I don't understand why the model isn't working at all, even on the training dataset. Here is the code snippet for this part:

def run_all(filename):
    PATH = "/content/random/"
    torch.manual_seed(2020)
    np.random.seed(2020)
    random.seed(2020)
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    if device.type == "cuda":
        print(torch.cuda.get_device_name())

    embedding_dims = 30
    batch_size = 32
    epochs = 60
    test_df = pd.DataFrame(
        {
            "Imagename": [filename]
        }
    )

    test_ds = MNIST(test_df, PATH, train=False, transform=transforms.ToTensor())
    test_loader = DataLoader(test_ds, batch_size=1, shuffle=False, num_workers=4, pin_memory=True)

    # load the TorchScript model onto the current device
    model = torch.jit.load(PATH + "trained_model_random_cpp.pt", map_location=device)

    t1 = time.time() * 1000
    model.eval()
    embeddings = []
    with torch.no_grad():
        for img in tqdm(test_loader):
            embeddings.append(model(img.to(device)).cpu().numpy())
    embeddings = np.concatenate(embeddings)
    t2 = time.time() * 1000

    tree = XGBClassifier(seed=2020)
    tree.load_model(PATH + "random_one_pred.json")

    t5 = time.time() * 1000
    submit = tree.predict(embeddings)
    t6 = time.time() * 1000

    print(submit)

    print(t6 - t5)

Did you check which part of the pipeline differs between the two scripts?
If not, I would recommend checking the PyTorch model's predictions first and the XGBClassifier's afterwards to isolate the issue.
E.g. if the PyTorch model does not return the expected outputs in the second script (compared to the first one), check whether the data loading and preprocessing are the same or whether you changed them.
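
A minimal sketch of that check, assuming you save one reference embedding from the first script (the file name ref_embedding.npy and the reference index 0 are placeholders, not part of your code):

# --- first script: right after train_results is built ---
np.save(PATH2 + "ref_embedding.npy", train_results[0])

# --- second script: after the jit-loaded model has produced its embeddings ---
ref = np.load(PATH + "ref_embedding.npy")
emb = embeddings[0]  # embedding of the same image from run_all

print("max abs diff:", np.abs(ref - emb).max())
print("embeddings match:", np.allclose(ref, emb, atol=1e-5))

# if the embeddings match, test the XGBoost side in isolation
# by feeding the saved reference embedding to the loaded model:
print("XGB on reference embedding:", tree.predict(ref.reshape(1, -1)))

If the embeddings differ, the usual suspects are a transform applied during training but not in run_all, or the model weights/device differing between the two scripts; if they match but tree.predict still returns a single class, the problem is on the XGBoost side (e.g. the saved model file or the feature order).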