Different results Pytorch vs Libtorch

Hello!
I recently succeded to save a model from pytorch and load in libtorch.
I am using the mono_640x192 pretrained model.
I am trying to calculate the disparity map for a single picture, but I get different results in python compared to c++.
I tried and followed to code in python to process the image in c++, yet still I get a strange result.
Here is the code I followed in python

 with torch.no_grad():
    for idx, image_path in enumerate(paths):

        if image_path.endswith("_disp.jpg"):
            # don't try to predict disparity for a disparity image!
            continue

        # Load image and preprocess
        input_image = pil.open(image_path).convert('RGB')
        original_width, original_height = input_image.size
        input_image = input_image.resize((feed_width, feed_height), pil.LANCZOS)
        input_image = transforms.ToTensor()(input_image).unsqueeze(0)

        # PREDICTION
        input_image = input_image.to(device)
        features = encoder(input_image)
        outputs = depth_decoder(features)
        disp = outputs[("disp", 0)]
        disp_resized = torch.nn.functional.interpolate(
            disp, (original_height, original_width), mode="bilinear", align_corners=False)
        output_name = os.path.splitext(os.path.basename(image_path))[0]
        scaled_disp, depth = disp_to_depth(disp, 0.1, 100)
        disp_resized_np = disp_resized.squeeze().cpu().numpy()
        vmax = np.percentile(disp_resized_np, 95)
        normalizer = mpl.colors.Normalize(vmin=disp_resized_np.min(), vmax=vmax)
        mapper = cm.ScalarMappable(norm=normalizer, cmap='magma')
        colormapped_im = (mapper.to_rgba(disp_resized_np)[:, :, :3] * 255).astype(np.uint8)
        im = pil.fromarray(colormapped_im)
        name_dest_im = os.path.join(output_directory, "{}_disp.jpeg".format(output_name))
        im.save(name_dest_im)

And here is my code in libtorch

std::pair<torch::Tensor, torch::Tensor> disp_to_depth(const torch::Tensor& disp, float min_depth, float max_depth) {
torch::Tensor min_disp = 1 / max_depth * torch::ones_like(disp);
torch::Tensor max_disp = 1 / min_depth * torch::ones_like(disp);
torch::Tensor scaled_disp = min_disp + (max_disp - min_disp) * disp;
torch::Tensor depth = 1 / scaled_disp;

return std::make_pair(scaled_disp, depth);

}
int main() {
float STEREO_SCALE_FACTOR = 5.4;
std::string image_path = “/home/anamudura/licenta/monodepth2/cpp/assets/test_image.jpg”;
std::string output_directory = “/home/anamudura/licenta/monodepth2/cpp/assets”;
torch::jit::script::Module outputs;
try {
// Deserialize the ScriptModule from a file using torch::jit::load().
outputs = torch::jit::load(“/home/anamudura/licenta/monodepth2/cpp/cpp_models/traced_monodepth_model.pt”);
} catch(const std::exception& e) {
std::cerr << e.what() << ‘\n’;
return -1;
}

// Load and process the image
cv::Mat image = cv::imread("/home/anamudura/licenta/monodepth2/cpp/assets/test_image.jpg");
std::cout << "height: " << image.rows << std::endl;
std::cout << "width: " << image.cols << std::endl;
cv::resize(image, image, cv::Size(640, 192));
torch::Tensor input_tensor = torch::from_blob(image.data, {1, image.rows, image.cols, 3}, torch::kByte);
input_tensor = input_tensor.permute({0, 3, 1, 2}).to(torch::kFloat32) / 255.0;
input_tensor = input_tensor.to(torch::kCUDA);
at::Dict<at::IValue, at::IValue> output_dict = outputs.forward({input_tensor}).toGenericDict();
int original_height = image.rows;
int original_width = image.cols;
torch::Tensor disp = output_dict.at("disp0").toTensor();
at::Tensor disp_resized = torch::nn::functional::interpolate(
    disp,
    torch::nn::functional::InterpolateFuncOptions()
        .size(std::vector<int64_t>({original_height, original_width}))
        .mode(torch::kBilinear)
        .align_corners(false)
);
// Calculate scaled_disp and depth using disp_to_depth function - checked
auto result = disp_to_depth(disp, 0.1, 100);
torch::Tensor scaled_disp = result.first;
torch::Tensor depth = result.second;
std::string output_name = std::filesystem::path(image_path).stem();

std::string name_dest_im = output_directory + "/" + output_name + "_disp.jpeg";

torch::Tensor disp_resized_squeezed = disp_resized.squeeze();
torch::Tensor disp_resized_cpu = disp_resized_squeezed.cpu();
float* disp_resized_data = disp_resized_cpu.data_ptr<float>();
std::size_t numel = disp_resized_cpu.numel();
std::vector<float> sorted_data(disp_resized_data, disp_resized_data + numel);
std::sort(sorted_data.begin(), sorted_data.end());

float vmax = sorted_data[static_cast<std::size_t>(numel * 0.95)];
float vmin = *std::min_element(disp_resized_data, disp_resized_data + numel);

cv::Mat disp_resized_image = cv::Mat(original_height, original_width, CV_8UC1); // Change to single-channel grayscale image
for (int i = 0; i < original_height; ++i) {
    for (int j = 0; j < original_width; ++j) {
        // Map the tensor values to the 0-255 range for grayscale
        uint8_t pixel_value = static_cast<uint8_t>((disp_resized_data[i * original_width + j] - vmin) / (vmax - vmin) * 255);
        disp_resized_image.at<uint8_t>(i, j) = pixel_value;
    }
}
cv::Mat colormapped_im;
cv::applyColorMap(disp_resized_image, colormapped_im, cv::COLORMAP_MAGMA);

std::cout << "Done till now" << std::endl;
cv::imwrite(name_dest_im, colormapped_im);

return 0;

}
On the left is the output in Python and on the right is my output in C++


Did I miss some steps in the preprocessing? Let me know if more information is needed.