Single-image forward pass:
std::vectorstd::vector
model4_dec::Run(const cv::Mat& img, float conf_threshold, float iou_threshold) {
torch::NoGradGuard no_grad;
std::cout << “----------New Frame----------” << std::endl;
// TODO: check_img_size()
/*** Pre-process ***/
auto start = std::chrono::high_resolution_clock::now();
// keep the original image for visualization purpose
cv::Mat img_input = img.clone();
std::vector<float> pad_info;
pad_info = LetterboxImage(img_input, img_input, cv::Size(INPUT_W, INPUT_H));
const float pad_w = pad_info[0];
const float pad_h = pad_info[1];
const float scale = pad_info[2];
cv::cvtColor(img_input, img_input, cv::COLOR_BGR2RGB); // BGR -> RGB
img_input.convertTo(img_input, CV_32FC3, 1.0f / 255.0f); // normalization 1/255
auto tensor_img = torch::from_blob(img_input.data, { 1, img_input.rows, img_input.cols, img_input.channels() }).to(device_);
tensor_img = tensor_img.permute({ 0, 3, 1, 2 }).contiguous(); // BHWC -> BCHW (Batch, Channel, Height, Width)
if (half_) {
tensor_img = tensor_img.to(torch::kHalf);
}
std::vector<torch::jit::IValue> inputs;
inputs.emplace_back(tensor_img);
auto end = std::chrono::high_resolution_clock::now();
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
// It should be known that it takes longer time at first time
std::cout << "pre-process takes : " << duration.count() << " ms" << std::endl;
/*** Inference ***/
// TODO: add synchronize point
start = std::chrono::high_resolution_clock::now();
// inference
torch::jit::IValue output = module_.forward(inputs);
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
// It should be known that it takes longer time at first time
std::cout << "inference takes : " << duration.count() << " ms" << std::endl;
/*** Post-process ***/
start = std::chrono::high_resolution_clock::now();
auto detections = output.toTuple()->elements()[0].toTensor();
// result: n * 7
// batch index(0), top-left x/y (1,2), bottom-right x/y (3,4), score(5), class id(6)
auto result = PostProcessing(detections, pad_w, pad_h, scale, img.size(), conf_threshold, iou_threshold);
end = std::chrono::high_resolution_clock::now();
duration = std::chrono::duration_cast<std::chrono::milliseconds>(end - start);
// It should be known that it takes longer time at first time
std::cout << "post-process takes : " << duration.count() << " ms" << std::endl;
return result;
}
But I want to use a batch of images in a single forward pass:
std::vectorstd::vector model4_dec::tensor_run(std::vectorcv::Mat& _vecimg, float conf_threshold, float iou_threshold)
{
torch::NoGradGuard no_grad;
float scale = 1;// std::min(out_w / in_w, out_h / in_h);
int imgwidth = INPUT_W;
int imgheight = INPUT_H;
static float data[BATCH_SIZE * 3 * INPUT_H * INPUT_W];
for (int b = 0; b < _vecimg.size(); b++)
{
// keep the original image for visualization purpose
cv::Mat img_input = _vecimg[b].clone();
imgwidth = img_input.cols;
imgheight = img_input.rows;
scale = std::min(INPUT_W / img_input.cols, INPUT_H / img_input.rows);
if (img_input.empty()) continue;
cv::Mat pr_img = preprocess_img(img_input, INPUT_W, INPUT_H); // letterbox BGR to RGB
pr_img.convertTo(pr_img, CV_32FC3, 1.0f / 255.0f); // normalization 1/255
int i = 0;
for (int row = 0; row < INPUT_H; ++row) {
uchar* uc_pixel = pr_img.data + row * pr_img.step;
for (int col = 0; col < INPUT_W; ++col) {
data[b * 3 * INPUT_H * INPUT_W + i] = (float)uc_pixel[1] / 255.0;
data[b * 3 * INPUT_H * INPUT_W + i + INPUT_H * INPUT_W] = (float)uc_pixel[1] / 255.0;
data[b * 3 * INPUT_H * INPUT_W + i + 2 * INPUT_H * INPUT_W] = (float)uc_pixel[0] / 255.0;
uc_pixel += 3;
++i;
}
}
}
auto tensor_img = torch::from_blob(data, { BATCH_SIZE, INPUT_H, INPUT_W, 3 }).to(device_);
tensor_img = tensor_img.permute({ 0, 3, 1, 2 }).contiguous(); // BHWC -> BCHW (Batch, Channel, Height, Width)
if (half_) {
tensor_img = tensor_img.to(torch::kHalf);
}
std::vector<torch::jit::IValue> inputs;
inputs.emplace_back(tensor_img);
// inference
torch::jit::IValue output = module_.forward(inputs);
/*** Post-process ***/
auto detections = output.toTuple()->elements()[0].toTensor();
// result: n * 7
// batch index(0), top-left x/y (1,2), bottom-right x/y (3,4), score(5), class id(6)
auto result = PostProcessing(detections, 0, 0, scale, cv::Size(imgwidth, imgheight), conf_threshold, iou_threshold);
return result;
}