Is this the right way to do it? Here is my code:
// matImg is a cv::Mat loaded with opencv,and convert to CV_32FC3
// m_model is already loaded
at::Tensor input = torch::from_blob(matImg.data, { 1, 512,512,3 }, torch::kFloat).to(torch::kCUDA);
input = input.permute({ 0,3,1,2 });
input = input.to(torch::kHalf);
m_model->to(torch::kHalf);
auto output = m_model->forward({ input }).toTuple();
at::Tensor arm_loc = output->elements()[0].toTensor().to(torch::kFloat).to(torch::kCUDA);
at::Tensor arm_conf = output->elements()[1].toTensor().to(torch::kFloat).to(torch::kCUDA);
at::Tensor odm_loc = output->elements()[2].toTensor().to(torch::kFloat).to(torch::kCUDA);
at::Tensor odm_conf = output->elements()[3].toTensor().to(torch::kFloat).to(torch::kCUDA);
This is then followed by post-processing (such as NMS) to get the final detection result.
The time I measure includes copying the data to CUDA, the forward() call, and post-processing; of the 60 ms total, loading the data and post-processing occupy about 10 ms.