I have a problem getting the resulting image partitioned when i use c++ frontend,,,

As shown above, I always get the resulting image divided into 9 equal parts.
This is really annoying me. Why is this happening,?

torch::Tensor Mat_To_Tensor(Mat mat)
    std::cout << "converting mat to tensor\n";

    mat.convertTo(mat, CV_32FC3, 1.0 / 255.0f);

    torch::Tensor tensor = torch::from_blob(mat.data, { mat.channels(), mat.rows, mat.cols }, torch::kFloat);
    tensor = tensor.unsqueeze_(0);
    cout << "tensor sizes : " << tensor.sizes() << endl;
    tensor = tensor.to(torch::kCUDA);

    return tensor;

Mat Tensor_To_Mat(torch::Tensor tensor)
    std::cout << "converting tensor to mat\n";

    tensor = tensor.to(torch::kCPU);
    tensor = tensor.squeeze(0).detach().permute({ 1, 2, 0 });
    tensor = tensor.mul(255).clamp(0, 255).to(torch::kU8);

    int64_t height = tensor.size(0);
    int64_t width = tensor.size(1);
    Mat mat(height, width, CV_8UC3);
    std::memcpy((void*)mat.data, tensor.data_ptr(), sizeof(torch::kU8) * tensor.numel());

    return mat.clone();

int main() {
    const string model_path = "../models/net_22.pt";
    const string img_path = "../data/input3.png";
    torch::jit::script::Module Net = torch::jit::load(model_path);

    Mat im = imread(img_path, IMREAD_COLOR);
    torch::Tensor tensor = Mat_To_Tensor(im);

    std::vector<torch::jit::IValue> inputs;

    torch::Tensor output = Net.forward(inputs).toTensor();
    output = torch::exp(tensor) - 1 + output;

    Mat res = Tensor_To_Mat(output);
    imwrite("../data/res3.png", res);

    return 0;

This is my code.
net_22.pt is trained script model by pytorch.
please help me,

OpenCV uses the channels-last memory format, so using:

torch::Tensor tensor = torch::from_blob(mat.data, { mat.channels(), mat.rows, mat.cols }, torch::kFloat);

would create an interleaved tensor and you should either permute the OpenCV matrix or create the tensor in a channels-last format and permute it afterwards.