Hi everyone,

This is part of my code. My problem is that copying the result tensor back to the CPU takes ~90 ms. Where is the problem?

at::Tensor image_tensor = torch::from_blob(

input_preproc.data, { 1, _img_h, _img_w, _img_d }, at::kFloat);

image_tensor = image_tensor.to(*_device);

image_tensor = image_tensor.permute({ 0, 3, 1, 2 });

image_tensor[0][2] =

image_tensor[0][0].div(255).sub(_img_means[0]).div(_img_stds[0]);

image_tensor[0][1] =

image_tensor[0][1].div(255).sub(_img_means[1]).div(_img_stds[1]);

image_tensor[0][0] =

image_tensor[0][2].div(255).sub(_img_means[2]).div(_img_stds[2]);

std::vectortorch::jit::IValue inputs;

inputs.push_back(image_tensor);

torch::Tensor logits_tensor = _module->forward(inputs).toTensor();

torch::Tensor argmax_tensor = logits_tensor.argmax(1, false);

argmax_tensor = argmax_tensor.toType(at::kInt);

// it takes about ~90ms

auto begin = std::chrono::high_resolution_clock::now();

**argmax_tensor = argmax_tensor.to(torch::kCPU);**

auto end = std::chrono::high_resolution_clock::now();

OS: Win 10

GPU: GTX 1080Ti

CUDA: 10.1