Consider the C++ example below:
#include <torch/script.h>
#include <iostream>

int main() {
    double arr1[] = {1., 2., 3., 4., 5., 6., 1., 2., 3., 4., 5., 6.};
    float  arr2[] = {1., 2., 3., 4., 5., 6., 1., 2., 3., 4., 5., 6.};
    auto torch_double_options = torch::TensorOptions().dtype(torch::kFloat64);
    auto torch_single_options = torch::TensorOptions().dtype(torch::kFloat32);
    // Wrap the host arrays and move the tensors to the GPU.
    auto tensor1 = torch::from_blob(arr1, 12, torch_double_options).to(torch::kCUDA);
    auto tensor2 = torch::from_blob(arr2, 12, torch_single_options).to(torch::kCUDA);
    // Copy back to the CPU and grab a raw pointer in a single expression.
    auto tensor1_ptr = tensor1.to(torch::kCPU).data_ptr<double>();
    auto tensor2_ptr = tensor2.to(torch::kCPU).data_ptr<float>();
    std::cout << "\n";
    for (int i = 0; i < 12; i++) {
        std::cout << *(tensor1_ptr + i) << " " << *(tensor2_ptr + i) << "\n";
    }
    auto tensor1_cuda = tensor1.to(torch::kCUDA);
    auto tensor2_cuda = tensor2.to(torch::kCUDA);
    // auto tensor1_cpu = tensor1.to(torch::kCPU);
    // auto tensor1_ptr = tensor1_cpu.data_ptr<double>();
    // auto tensor2_cpu = tensor2.to(torch::kCPU);
    // auto tensor2_ptr = tensor2_cpu.data_ptr<float>();
    // std::cout << "\n";
    // for (int i = 0; i < 12; i++){
    //     std::cout << *(tensor1_ptr + i) << " " << *(tensor2_ptr + i) << "\n";
    // }
}
It generates the following output:
4.99056e-315 0.0846298
1.28505e-316 0
3 5.17494e-38
4 0
5 5
6 6
1 1
2 2
3 3
4 4
5 5
6 6
As you can see, the first two elements of the double tensor and the first four elements of the float tensor are garbage. In both cases that is the first 16 bytes (128 bits) of the buffer, since two doubles and four floats each occupy 16 bytes. So when data_ptr is taken from a GPU-resident tensor with a single-line to().data_ptr<>() expression, the first 128 bits come back mangled?
Interestingly, if I split the to() and data_ptr() calls into two separate statements, the problem disappears (uncomment the commented-out lines above), i.e.:
auto tensor1_cpu = tensor1.to(torch::kCPU);
auto tensor1_ptr = tensor1_cpu.data_ptr<double>();
auto tensor2_cpu = tensor2.to(torch::kCPU);
auto tensor2_ptr = tensor2_cpu.data_ptr<float>();
std::cout << "\n";
for (int i = 0; i < 12; i++) {
    std::cout << *(tensor1_ptr + i) << " " << *(tensor2_ptr + i) << "\n";
}
This prints the proper tensor values:
1 1
2 2
3 3
4 4
5 5
6 6
1 1
2 2
3 3
4 4
5 5
6 6
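My suspicion (and I may be wrong) is that the temporary Tensor returned by to(torch::kCPU) is destroyed at the end of the full expression, so the CPU storage behind the pointer is freed before I ever read it. If so, this would be the same class of bug as the classic std::string::c_str() dangling pointer. A minimal plain-C++ sketch of the analogy (nothing LibTorch-specific here):

#include <iostream>
#include <string>

std::string make() { return "hello"; }

int main() {
    // The temporary returned by make() dies at the end of this full
    // expression, so p dangles immediately; reading it is undefined behavior.
    const char* p = make().c_str();
    (void)p;

    // Naming the object keeps its storage alive, so q remains valid.
    std::string s = make();
    const char* q = s.c_str();
    std::cout << q << "\n";
}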
Also, the original tensor itself does not seem to be modified; it looks like only the pointer ends up dangling?
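For completeness, the workaround I have settled on for now copies the values out while a named CPU tensor keeps the storage alive (the helper name to_vector is my own, and I am assuming contiguous() is the right call to guarantee a dense layout before copying):

#include <torch/script.h>
#include <vector>

// Copy a tensor's values into a std::vector while a named CPU tensor
// owns the storage, so the raw pointer never outlives its buffer.
std::vector<double> to_vector(const torch::Tensor& t) {
    auto cpu = t.to(torch::kCPU).contiguous();       // named: keeps storage alive
    const double* p = cpu.data_ptr<double>();
    return std::vector<double>(p, p + cpu.numel());  // copy before cpu is destroyed
}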
What am I doing wrong?