Segmentation fault in the following code (using C++ API for prediction)

reddy · March 11, 2019, 8:41pm

Using CUDA 10 on linux

Segv occurs at line ‘int idx = top_idxs_a[i];’ in the code below

#include <algorithm>
#include <cstdint>
#include <fstream>
#include <iostream>
#include <string>
#include <tuple>
#include <vector>

#include <torch/script.h>
#include <opencv2/opencv.hpp>

#include "utils.hpp"

int main(int argc, char** argv)
{
    if (argc != 4) {
    std::cerr << "usage: test_PyTorch_Cpp_Prediction_GPU <path-to-image> <path-to-exported-script-module> <path-to-labels-file> \n";
    return -1;
  } 

  std::string image_path = argv[1];
  std::string model_path = argv[2];
  std::string labels_path = argv[3];

  int image_height = 224;
  int image_width = 224;
  int n_channels = 3;

  // Read labels
  std::vector<std::string> labels;
  std::string label;
  std::ifstream labelsfile (labels_path);
  if (labelsfile.is_open())
  {
    while (getline(labelsfile, label))
    {
      labels.push_back(label);
    }
    labelsfile.close();
  }

  std::vector<double> mean = {0.485, 0.456, 0.406};
  std::vector<double> std = {0.229, 0.224, 0.225};

  cv::Mat image = cv::imread(image_path, 1);

  // Deserialize the ScriptModule from a file using torch::jit::load().
  std::shared_ptr<torch::jit::script::Module> module = torch::jit::load(model_path);
  assert(module != nullptr);

  module->to(torch::kCUDA);

  // Preprocess image
  image = preprocess(image, image_height, image_width, mean, std);

  std::vector<int64_t> dims = {1, image_height, image_width, n_channels};
  std::vector<int64_t> permute_dims = {0, 3, 1, 2};
  torch::TensorOptions options(torch::kFloat32);
  torch::Tensor image_as_tensor = torch::from_blob(image.data, torch::IntList(dims), options).clone();
  image_as_tensor = image_as_tensor.permute(torch::IntList(permute_dims));
  image_as_tensor = image_as_tensor.toType(torch::kFloat32);
  
  // image_as_tensor = image_as_tensor.to(torch::kCUDA);
  std::vector<torch::jit::IValue> inputs;
  inputs.push_back((image_as_tensor).cuda());
  torch::Tensor output = module->forward(inputs).toTensor();
  // torch::Tensor output = module->forward({image_as_tensor}).toTensor();
  output = output.to(torch::kCPU, false);
  std::cout << output.sizes() << "\n";

  // out tensor sort, print the first 2 category
  std::tuple<torch::Tensor,torch::Tensor> result = output.sort(-1, true);
  torch::Tensor top_scores = std::get<0>(result)[0];
  torch::Tensor top_idxs = std::get<1>(result)[0].toType(torch::kInt32);

  auto top_scores_a = top_scores.accessor<float,1>();
  auto top_idxs_a = top_idxs.accessor<int,1>();
  // std::cout << top_scores_a << "\n";
  // std::cout << top_idxs_a << "\n";
  std::cout << labels.size() << "\n";
  for (int i = 0; i < 5; ++i) {
    int idx = top_idxs_a[i];
    std::cout <<idx << labels[idx] <<"\n";
    std::cout << "top " << i+1 << ", label: "<< labels[idx] << " ";
    std::cout << ", score: " << top_scores_a[i] << std::endl;
    }
}

LeviViana · March 11, 2019, 9:25pm

I think it happens because you are trying to access, from the host, a value of a tensor that is stored on the device. Run the same operation inside a kernel, or move top_idxs to the CPU, and please tell me what happens.

reddy · March 12, 2019, 3:02am

Thanks @LeviViana. That does fix the segv. But the output classification results seem way off:

$ ./test_PyTorch_Cpp_Prediction_GPU example.JPEG ./../../model.pt labels.txt
[1, 1000]
1000
238Greater_Swiss_Mountain_dog
, score: 1.54363ater_Swiss_Mountain_dog
777scabbard
, score: 1.54137bbard
859toaster
, score: 1.47798ster
454bookshop
, score: 1.42607kshop
219cocker_spaniel
, score: 1.4023cker_spaniel

This is the input image:
example

LeviViana · March 12, 2019, 8:50pm

Are you sure that you are loading a pretrained model? And that you are properly loading its weights?

Another side question: why are you using the C++ API? If you want to do inference with deep CNN models, the Python overhead may not be significant, since the bottleneck will probably be the convolutions, which are done by cuDNN in both APIs (assuming that you use a GPU for inference).

reddy · March 13, 2019, 3:47pm

I am using pretrained models from torchvision.models. So, I believe it should do the right thing
I have been using PyTorch python API for the past few months. Two reasons why I am exploring C++ API:
(a) I was under the impression it will be faster. But, based on your comments it shouldn’t be that different compared to python API
(b) For deployment in production

In fact, from my experiments I see that the C++ API is slower than the Python API for some of the models I tried:

C++ API, torch JIT

./test_PyTorch_JIT_Cmp
Network: alexnet, Batch-size: 1, Images/Sec: 633.132
Network: vgg-16, Batch-size: 1, Images/Sec: 208.806
Network: resnet-50, Batch-size: 1, Images/Sec: 128.854

Python

$ python test_PyTorch_Cmp.py
Network: alexnet, Batch-size: 1, Images/Sec: 717.5638130431807
Network: vgg-16, Batch-size: 1, Images/Sec: 236.11391642601006
Network: resnet-50, Batch-size: 1, Images/Sec: 127.30998980136953

For reference (partial code below)

Python

# Throughput benchmark: run dry_run untimed iterations, then time num_batches
# iterations of the model and report images per second.
# Names such as model, batch_Tensor, dev, network and batch_size are defined
# outside this fragment.
dry_run = 5 # use 5 iterations to warm up
    num_batches = 10

    for i in range(dry_run+num_batches):
        if i == dry_run:
            # warm-up is over: start the wall-clock timer
            tic = time.time()
        batch_TensorTemp = torch.autograd.Variable(batch_Tensor)
        if dev == "gpu":
            # move tensor to GPU
            batch_TensorTemp = batch_TensorTemp.cuda()
        output_batch = model(batch_TensorTemp)
        if dev == "gpu":
            # move output to cpu
            output_batch = output_batch.data.cpu()
    # stop the timer after the last timed iteration
    end = time.time()

    # throughput = timed images / elapsed seconds (warm-up iterations excluded)
    print("Network: {}, Batch-size: {}, Images/Sec: {}\n".format(network, batch_size, (num_batches*batch_size/(end - tic))))

C++

// Throughput benchmark: run dry_run untimed iterations, then time num_batches
// iterations with CUDA events and report images per second.
// start, stop, milliseconds, seconds, batchSize, networkName, module and
// image_as_tensor are declared outside this fragment.
int dry_run = 5;
        int num_batches = 10;

        for(int iter = 0; iter < dry_run + num_batches; ++iter){
          if(iter == dry_run){
            // warm-up is over: record the start event
            cudaEventRecord(start);
          }
          std::vector<torch::jit::IValue> inputs;
          // copy the input to the GPU on every iteration
          inputs.push_back((image_as_tensor).cuda());
          torch::Tensor output = module->forward(inputs).toTensor();
          // copy the result back to the CPU
          output = output.to(torch::kCPU);    
        }
        // record the stop event after the last timed iteration
        cudaEventRecord(stop);
    
        // wait for the stop event before reading the elapsed time
        cudaEventSynchronize(stop);
        cudaEventElapsedTime(&milliseconds, start, stop);
        seconds = milliseconds/1000;
        // throughput = timed images / elapsed seconds (warm-up iterations excluded)
        float throughPut = num_batches*batchSize/seconds;
        std::cout << "Network: " << networkName << ", Batch-size: " << batchSize << ", Images/Sec: " << throughPut << "\n";

dambo · June 15, 2019, 5:28pm

Did you try using batches when comparing the results between C++ and Python?