Lock libtorch thread

Hi guys,

I am loading weights and performing only the feedforward step using libtorch in C++. Since I am on CPU, it turns out that this prediction step is slower than one iteration of the outer while loop. The scenario is the following:

while (...){
  output = torch_predict(input);
  display_results(output);
}

Very often output is empty, or not complete. If I add a sleep in my executable this appears less frequently. I guess it is a problem related to the different thread(s) in which libtorch is operating. Is there any way I can use a mutex? Alternatively, can I somehow know when libtorch has finished its computation for the current iteration? Could migrating to GPU be a solution?

Thanks in advance

If output is empty it seems that either the data was deleted already or never filled properly.
Could you post an executable code snippet, which reproduces this issue, as it might be a bug in the backend?

Thanks for your reply, here a simplification of the executable

// Fully convolutional network: a shared VGG-style encoder followed by two
// heads — a detector head (convPa/convPb -> 65-channel "semi" heatmap) and a
// descriptor head (convDa/convDb -> 256-d descriptors).
// NOTE(review): this looks like the SuperPoint architecture — confirm.
struct Net : torch::nn::Module {
  // Builds all layers. All 3x3 convs use stride 1 / padding 1 (shape-preserving);
  // the two 1x1 convs (convPb, convDb) are channel-mixing projections.
  Net():
        conv1a(torch::nn::Conv2dOptions( 1, 64, 3).stride(1).padding(1)),
        conv1b(torch::nn::Conv2dOptions(64, 64, 3).stride(1).padding(1)),
        conv2a(torch::nn::Conv2dOptions(64, 64, 3).stride(1).padding(1)),
        conv2b(torch::nn::Conv2dOptions(64, 64, 3).stride(1).padding(1)),
        conv3a(torch::nn::Conv2dOptions(64, 128, 3).stride(1).padding(1)),
        conv3b(torch::nn::Conv2dOptions(128, 128, 3).stride(1).padding(1)),
        conv4a(torch::nn::Conv2dOptions(128, 256, 3).stride(1).padding(1)),
        conv4b(torch::nn::Conv2dOptions(256, 256, 3).stride(1).padding(1)),
        convPa(torch::nn::Conv2dOptions(256, 256, 3).stride(1).padding(1)),
        convPb(torch::nn::Conv2dOptions(256, 65, 1).stride(1).padding(0)),
        convDa(torch::nn::Conv2dOptions(256, 256, 3).stride(1).padding(1)),
        convDb(torch::nn::Conv2dOptions(256, 256, 1).stride(1).padding(0)){
        
    
    // Registration makes the parameters visible to torch::load / optimizers;
    // the names must match those used when the weights were saved.
    register_module("conv1a", conv1a);
    register_module("conv1b", conv1b);
    register_module("conv2a", conv2a);
    register_module("conv2b", conv2b);
    register_module("conv3a", conv3a);
    register_module("conv3b", conv3b);
    register_module("conv4a", conv4a);
    register_module("conv4b", conv4b);
    register_module("convPa", convPa);
    register_module("convPb", convPb);
    register_module("convDa", convDa);
    register_module("convDb", convDb);
  }
  
  // Forward pass.
  // @param x  input image tensor; indexing below implies shape [B, 1, H, W],
  //           with H and W divisible by 8 (three 2x2 max-pools) — TODO confirm.
  // @return   {semi, desc}:
  //           semi  — per-pixel detection heatmap, [B, H, W]
  //           desc  — L2-normalized descriptors, [B, 256, H/8, W/8]
  std::vector<torch::Tensor> forward(torch::Tensor x){
    // Encoder: conv-conv-pool blocks; spatial resolution halves at each pool.
    x = torch::relu(conv1a->forward(x));
    x = torch::relu(conv1b->forward(x));
    x = torch::max_pool2d(x, 2, 2);

    x = torch::relu(conv2a->forward(x));
    x = torch::relu(conv2b->forward(x));
    x = torch::max_pool2d(x, 2, 2);

    x = torch::relu(conv3a->forward(x));
    x = torch::relu(conv3b->forward(x));
    x = torch::max_pool2d(x, 2, 2);

    x = torch::relu(conv4a->forward(x));
    x = torch::relu(conv4b->forward(x));

    // Detector head: 65 channels = 64 cell positions + 1 "no keypoint" bin.
    auto cPa = torch::relu(convPa->forward(x));
    auto semi = convPb->forward(cPa);  // [B, 65, H/8, W/8]

    // Descriptor head.
    auto cDa = torch::relu(convDa->forward(x));
    auto desc = convDb->forward(cDa);  // [B, d1, H/8, W/8]

    // L2-normalize descriptors along the channel dimension (dim 1).
    auto dn = torch::norm(desc, 2, 1);
    desc = desc.div(torch::unsqueeze(dn, 1));

    // Softmax over the 65 bins, then drop the last ("dustbin") channel.
    semi = torch::softmax(semi, 1);
    semi = semi.slice(1, 0, 64);
    semi = semi.permute({0, 2, 3, 1});  // [B, H/8, W/8, 64]

    // Unfold the 64 channels back into 8x8 spatial cells to rebuild the
    // full-resolution heatmap. The permute/view order here is load-bearing:
    // it interleaves cell rows/columns back into image coordinates.
    int Hc = semi.size(1);
    int Wc = semi.size(2);
    semi = semi.contiguous().view({-1, Hc, Wc, 8, 8});
    semi = semi.permute({0, 1, 3, 2, 4});
    semi = semi.contiguous().view({-1, Hc * 8,   Wc * 8}); // [B, H, W]

    std::vector<torch::Tensor> ret;
    ret.resize(2);
    ret[0] = semi;  // heatmap
    ret[1] = desc;  // descriptors
    return ret;
  }
  
  torch::nn::Conv2d conv1a;
  torch::nn::Conv2d conv1b;
  torch::nn::Conv2d conv2a;
  torch::nn::Conv2d conv2b;
  torch::nn::Conv2d conv3a;
  torch::nn::Conv2d conv3b;
  torch::nn::Conv2d conv4a;
  torch::nn::Conv2d conv4b;
  torch::nn::Conv2d convPa;
  torch::nn::Conv2d convPb;
  torch::nn::Conv2d convDa;
  torch::nn::Conv2d convDb;
};

// Runs one inference pass on a single-channel float image and returns the
// detection heatmap.
// @param img    input image; assumed continuous, single-channel CV_32F — TODO confirm at call site.
// @param model  loaded network; must not be null.
// @param cuda   request GPU execution (falls back to CPU if CUDA unavailable).
// @return       heatmap tensor of shape [H, W].
// @throws std::runtime_error if model is null.
torch::Tensor predict(const cv::Mat& img, std::shared_ptr<Net> model, bool cuda){
        if(model == nullptr)
            throw std::runtime_error("Predictor::detect|ERROR, model not set correctly!");
        
        // Inference only: disable autograd bookkeeping (this also makes the
        // later set_requires_grad(false) of the original code redundant).
        torch::NoGradGuard no_guard;

        // from_blob does NOT take ownership of the pixel buffer, so clone()
        // the tensor once to copy the data. The original code additionally
        // cloned the cv::Mat first — that second copy is unnecessary, since
        // `img` stays alive for the whole full expression.
        torch::Tensor img_tensor =
            torch::from_blob(img.data, { 1, 1, img.rows, img.cols }, torch::kFloat32).clone();
        
        bool use_cuda = cuda && torch::cuda::is_available();
        torch::Device device(use_cuda ? torch::kCUDA : torch::kCPU);
        model->to(device);

        auto out = model->forward(img_tensor.to(device));

        // BUG FIX: the original assigned to an undeclared `_mProb` and then fell
        // off the end of a non-void function — undefined behaviour, which is the
        // reported "output is empty / not complete" symptom (the caller read a
        // garbage return value; no threading or mutex is involved). Return the
        // heatmap explicitly instead.
        return out[0].squeeze(0); // [H, W]
        // out[1] would be the descriptors: [1, 256, H/8, W/8]
}

int main(){
   // Build and load the network once, outside the processing loop.
   std::shared_ptr<Net> model(new Net);
   torch::load(model, "path/to/weigths");
    while(...){
         // generate img and check that it is not corrupted
         // BUG FIX: the function defined above is named `predict`, not `detect`;
         // the original call would not compile.
         torch::Tensor probs = predict(cv_img, model, false);
         // do stuff on probs
         // std::this_thread::sleep_for(std::chrono::milliseconds(sleep_for));
     }
}

probs, the return value of the predict function, is not always populated correctly. I've already checked that the input cv::Mat is ok before passing it to the predict function. If I add the sleep commented out in the above code, this behaviour is less frequent.

Thank you for your help