Inception_v3 Thread execution failed after warm-up

I get the following error when running Inception_v3 in multiple threads after 4 warm-up runs in a LibTorch (C++) environment.

terminate called after throwing an instance of ‘std::runtime_error’
what(): The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: The following operation failed in the TorchScript interpreter.
Traceback of TorchScript (most recent call last):
RuntimeError: CUDA driver error: invalid device context

What should I do?

Is the GPU still available after the run?
If so, could you update to the latest nightly release and rerun the script?

@ptrblck
I used torch 1.8.1, but the problem still remains.


#include <cuda_runtime.h>
#include <ATen/cuda/CUDAContext.h>
#include <c10/cuda/CUDAGuard.h>
#include <ATen/cuda/CUDAMultiStreamGuard.h>
#include <ATen/cuda/CUDAEvent.h>
#include <c10/cuda/CUDAStream.h>
#include <pthread.h>
#include <torch/script.h>
#include <torch/torch.h>
#include <typeinfo>
#include <iostream>
#include <inttypes.h>
#include <functional>
#include <memory>
#include <stdlib.h>
#include <c10/cuda/CUDAFunctions.h>
#include <cuda_profiler_api.h>
#include <limits.h>
#include <time.h>
#include <sys/time.h>

#include <cuda_runtime.h>
#include <thread>

#include <limits.h>
#include <time.h>
#include <sys/time.h>
#include <algorithm>


// Number of instances to run for each network family. main() uses these as
// loop bounds and array sizes, and their sum is n_all. A value of 0 disables
// that family.
#define n_dense 0
#define n_res 0
#define n_alex 0
#define n_vgg 0
#define n_wide 0
#define n_squeeze 0
#define n_mobile 0
#define n_mnasnet 0
#define n_inception 2
#define n_shuffle 0
#define n_resX 0

// Number of warm-up forward passes main() runs for every instance before the
// timed, threaded run.
#define WARMING 4

namespace F = torch::nn::functional;
using namespace std;

// CUDA device index used to build `device` below.
c10::DeviceIndex GPU_NUM = 0;

// Argument lists handed to Module::forward(). main() fills `inputs` with `x`
// and `inputs2` with a tensor derived from `x2`.
vector<torch::jit::IValue> inputs;
vector<torch::jit::IValue> inputs2;
// Device every model and tensor in this file is moved to.
torch::Device device = {at::kCUDA,GPU_NUM};
// All-ones input tensors of shape 1x3x224x224 and 1x3x299x299, created on
// `device` during static initialisation.
torch::Tensor x = torch::ones({1, 3, 224, 224}).to(device);
torch::Tensor x2 = torch::ones({1, 3, 299, 299}).to(device);

/**
 * Everything one prediction run needs: the module to execute, the arguments
 * to feed it, and the bookkeeping used to label and store its timing.
 *
 * The integer members carry default initialisers so that a default-constructed
 * Net never holds indeterminate values.
 */
typedef struct _net
{
	torch::jit::script::Module module;	// TorchScript module whose forward() is executed
	std::vector<torch::jit::IValue> input;	// arguments passed to module.forward()
	std::string name;	// network name, printed together with the execution time
	int index_n = 0;	// network index: slot used in start_time / end_time
	int n_all = 0;	// all network num
}Net;

/**
 * Reads the current time via gettimeofday().
 *
 * @return seconds (with a microsecond fraction) as a double, or 0 when
 *         gettimeofday() reports an error.
 */
double what_time_is_it_now()
{
    struct timeval now;
    const int status = gettimeofday(&now, NULL);
    if (status != 0) {
        return 0;
    }

    const double whole_seconds = (double)now.tv_sec;
    const double fractional_seconds = (double)now.tv_usec * .000001;
    return whole_seconds + fractional_seconds;
}

// Total number of network instances across all families (sum of the n_* macros).
const int n_all = n_alex + n_vgg + n_res + n_dense + n_wide + n_squeeze + n_mobile + n_mnasnet + n_inception + n_shuffle + n_resX;

// Per-instance timestamps written by network_predict() at index Net::index_n.
// main() provides the storage (n_all entries each) before any prediction runs.
double *start_time;
double *end_time;

/**
 * Runs one forward pass of net->module on net->input, stores the timestamps
 * taken before and after the call in start_time / end_time at slot
 * net->index_n, then prints the first 15 entries along dim 1 of the output
 * and the elapsed time.
 *
 * @param net  network instance to execute; must not be null and
 *             net->index_n must be a valid index into start_time / end_time.
 * @return always nullptr. The function is declared as returning void* so it
 *         can double as a thread start routine, and flowing off the end of a
 *         non-void function is undefined behaviour, hence the explicit return.
 */
void *network_predict(Net *net){

  start_time[net->index_n] = what_time_is_it_now();
  // NOTE(review): no explicit device/stream synchronisation happens before the
  // end timestamp is taken. If the forward pass only enqueues GPU work, the
  // reported time may not include its execution - confirm this is intended.
  at::Tensor out = net->module.forward(net->input).toTensor();
  end_time[net->index_n] = what_time_is_it_now();
  std::cout << out.slice(/*dim=*/1, /*start=*/0, /*end=*/15) << "\n";	
  std::cout << "\n***** "<<net->name<<" EXECUTION TIME : "<< end_time[net->index_n]-start_time[net->index_n] <<"s ***** \n";
  return nullptr;
}



int main(){    
    start_time = (double *)malloc(sizeof(double)*n_all);
    end_time = (double *)malloc(sizeof(double)*n_all);
    torch::jit::script::Module model[11];
    model[0] = torch::jit::load("../densenet201_model.pt");
    model[0].to(device);
    model[1] = torch::jit::load("../resnet152_model.pt");
    model[1].to(device);
    model[2] = torch::jit::load("../alexnet_model.pt");
    model[2].to(device);
    model[3] = torch::jit::load("../vgg_model.pt");
    model[3].to(device);
    model[4] = torch::jit::load("../wideresnet_model.pt");
    model[4].to(device);
    model[5] = torch::jit::load("../squeeze_model.pt");
    model[5].to(device);
    model[6] = torch::jit::load("../mobilenet_model.pt");
    model[6].to(device);
    model[7] = torch::jit::load("../mnasnet_model.pt");
    model[7].to(device);
    model[8] = torch::jit::load("../inception_model.pt");
    model[8].to(device);
    model[9] = torch::jit::load("../shuffle_model.pt");
    model[9].to(device);
    model[10] = torch::jit::load("../resnext_model.pt");
    model[10].to(device);

    pthread_t networkArray_dense[n_dense];
    pthread_t networkArray_res[n_res];
    pthread_t networkArray_alex[n_alex];
    pthread_t networkArray_vgg[n_vgg];
    pthread_t networkArray_wide[n_wide];
    pthread_t networkArray_squeeze[n_squeeze];
    pthread_t networkArray_mobile[n_mobile];
    pthread_t networkArray_mnasnet[n_mnasnet];
    pthread_t networkArray_inception[n_inception];
    pthread_t networkArray_shuffle[n_shuffle];
    pthread_t networkArray_resX[n_resX];

    Net net_input_dense[n_dense];
    Net net_input_res[n_res];
    Net net_input_alex[n_alex];
    Net net_input_vgg[n_vgg];
    Net net_input_wide[n_wide];
    Net net_input_squeeze[n_squeeze];
    Net net_input_mobile[n_mobile];
    Net net_input_mnasnet[n_mnasnet];
    Net net_input_inception[n_inception];
    Net net_input_shuffle[n_shuffle];
    Net net_input_resX[n_resX];

    inputs.push_back(x);
    auto x_ch0 = torch::unsqueeze(x2.index({torch::indexing::Slice(), 0}), 1) * (0.229 / 0.5) + (0.485 - 0.5) / 0.5;
    auto x_ch1 = torch::unsqueeze(x2.index({torch::indexing::Slice(), 1}), 1) * (0.224 / 0.5) + (0.456 - 0.5) / 0.5;
    auto x_ch2 = torch::unsqueeze(x2.index({torch::indexing::Slice(), 2}), 1) * (0.225 / 0.5) + (0.406 - 0.5) / 0.5;
      
    x_ch0.to(device);
    x_ch1.to(device);
    x_ch2.to(device);

    auto x_cat = torch::cat({x_ch0,x_ch1,x_ch2},1).to(device);
    inputs2.clear();
    inputs2.push_back(x_cat);

    for(int i=0;i<n_dense;i++){
      net_input_dense[i].module = model[0];
      net_input_dense[i].input = inputs;
      net_input_dense[i].name = "DenseNet";
      net_input_dense[i].index_n = i;
      for(int j=0;j<WARMING;j++){
          network_predict(&net_input_dense[i]);
          cudaDeviceSynchronize();
          net_input_dense[i].input = inputs;
        }
    }

    for(int i=0;i<n_res;i++){
	  net_input_res[i].module = model[1];
    net_input_res[i].name = "ResNet";
	  net_input_res[i].input = inputs;
    net_input_res[i].index_n = i+n_dense;
    for(int j=0;j<WARMING;j++){
        network_predict(&net_input_res[i]);
        cudaDeviceSynchronize();
        net_input_res[i].input = inputs;
      }
  }

  for(int i=0;i<n_alex;i++){
    net_input_alex[i].module = model[2];
    net_input_alex[i].index_n = i+ n_res + n_dense;
	  net_input_alex[i].input = inputs;
    net_input_alex[i].name = "AlexNet";
    for(int j=0;j<WARMING;j++){
        network_predict(&net_input_alex[i]);
        cudaDeviceSynchronize();
        net_input_alex[i].input = inputs;
      }
  }

  for(int i=0;i<n_vgg;i++){
    net_input_vgg[i].module = model[3];
	  net_input_vgg[i].input = inputs;
    net_input_vgg[i].name = "VGG";
    net_input_vgg[i].index_n = i + n_alex + n_res + n_dense;
    for(int j=0;j<WARMING;j++){
        network_predict(&net_input_vgg[i]);
        cudaDeviceSynchronize();
        net_input_vgg[i].input = inputs;
    }
  }

  for(int i=0;i<n_wide;i++){
    net_input_wide[i].module = model[4];
	  net_input_wide[i].input = inputs;
    net_input_wide[i].name = "WideResNet";
    net_input_wide[i].index_n = i+n_alex + n_res + n_dense + n_vgg;
    for(int j=0;j<WARMING;j++){
        network_predict(&net_input_wide[i]);
        cudaDeviceSynchronize();
        net_input_wide[i].input = inputs;
    }
  }

  for(int i=0;i<n_squeeze;i++){
      net_input_squeeze[i].module = model[5];
    net_input_squeeze[i].name = "SqueezeNet";
	  net_input_squeeze[i].input = inputs;
    net_input_squeeze[i].index_n = i + n_alex + n_res + n_dense + n_vgg + n_wide;
    for(int j=0;j<WARMING;j++){
        network_predict(&net_input_squeeze[i]);
        cudaDeviceSynchronize();
        net_input_squeeze[i].input = inputs;
    }
  }

  for(int i=0;i<n_mobile;i++){
    net_input_mobile[i].module = model[6];
	  net_input_mobile[i].input = inputs;
    net_input_mobile[i].name = "Mobile";
    net_input_mobile[i].index_n = i + n_alex + n_res + n_dense + n_vgg + n_wide + n_squeeze;
    for(int j=0;j<WARMING;j++){
        network_predict(&net_input_mobile[i]);
        cudaDeviceSynchronize();
    }
  }

  for(int i=0;i<n_mnasnet;i++){
    net_input_mnasnet[i].module = model[7];
	  net_input_mnasnet[i].input = inputs;
    net_input_mnasnet[i].name = "MNASNet";
    net_input_mnasnet[i].index_n = i + n_alex + n_res + n_dense + n_vgg + n_wide + n_squeeze + n_mobile;
    for(int j=0;j<WARMING;j++){
        network_predict(&net_input_mnasnet[i]);
        cudaDeviceSynchronize();
    }
  }

  for(int i=0;i<n_inception;i++){
    net_input_inception[i].module = model[8];     
    net_input_inception[i].input = inputs2;
    net_input_inception[i].name = "Inception_v3";
    net_input_inception[i].index_n = i + n_alex + n_res + n_dense + n_vgg + n_wide + n_squeeze + n_mobile + n_mnasnet;
    for(int j=0;j<WARMING;j++){
        network_predict(&net_input_inception[i]);
        cudaDeviceSynchronize();
    }
  }

  for(int i=0;i<n_shuffle;i++){
	  net_input_shuffle[i].module = model[9];
	  net_input_shuffle[i].input = inputs;
    net_input_shuffle[i].name = "ShuffleNet";
    net_input_shuffle[i].index_n = i + n_alex + n_res + n_dense + n_vgg + n_wide + n_squeeze + n_mobile + n_mnasnet + n_inception;
    for(int j=0;j<WARMING;j++){
        network_predict(&net_input_shuffle[i]);
        cudaDeviceSynchronize();

    }
  }

  for(int i=0;i<n_resX;i++){
    net_input_resX[i].module = model[10];
	  net_input_resX[i].input = inputs;
    net_input_resX[i].name = "ResNext";
    net_input_resX[i].index_n = i + n_alex + n_res + n_dense + n_vgg + n_wide + n_squeeze + n_mobile + n_mnasnet + n_inception + n_shuffle;
    for(int j=0;j<WARMING;j++){
        network_predict(&net_input_resX[i]);
        cudaDeviceSynchronize();
        net_input_resX[i].input = inputs;
    }
  }
    //cudaProfilerStart();
    cudaDeviceSynchronize();
    double time1 = what_time_is_it_now();
    //cudaProfilerStart();

    for(int i=0;i<n_dense;i++){
    if (pthread_create(&networkArray_dense[i], NULL, (void *(*)(void*))network_predict, &net_input_dense[i]) < 0){
      perror("thread error");
      exit(0);
    }
  }
  for(int i=0;i<n_res;i++){
    if (pthread_create(&networkArray_res[i], NULL, (void *(*)(void*))network_predict, &net_input_res[i]) < 0){
      perror("thread error");
      exit(0);
    }
  }
  for(int i=0;i<n_alex;i++){
    if (pthread_create(&networkArray_alex[i], NULL, (void *(*)(void*))network_predict, &net_input_alex[i]) < 0){
      perror("thread error");
      exit(0);
    }
  }
  for(int i=0;i<n_vgg;i++){
	  if (pthread_create(&networkArray_vgg[i], NULL, (void *(*)(void*))network_predict, &net_input_vgg[i]) < 0){
      perror("thread error");
      exit(0);
    }
  }
  for(int i=0;i<n_wide;i++){
    if (pthread_create(&networkArray_wide[i], NULL, (void *(*)(void*))network_predict, &net_input_wide[i]) < 0){
      perror("thread error");
      exit(0);
    }
  }

  for(int i=0;i<n_squeeze;i++){
    if (pthread_create(&networkArray_squeeze[i], NULL, (void *(*)(void*))network_predict, &net_input_squeeze[i]) < 0){
      perror("thread error");
      exit(0);
    }
  }

  for(int i=0;i<n_mobile;i++){
    if (pthread_create(&networkArray_mobile[i], NULL, (void *(*)(void*))network_predict, &net_input_mobile[i]) < 0){
      perror("thread error");
      exit(0);
    }
  }

  for(int i=0;i<n_mnasnet;i++){
    if (pthread_create(&networkArray_mnasnet[i], NULL, (void *(*)(void*))network_predict, &net_input_mnasnet[i]) < 0){
      perror("thread error");
      exit(0);
    }
  }

  for(int i=0;i<n_inception;i++){
    if (pthread_create(&networkArray_inception[i], NULL, (void *(*)(void*))network_predict, &net_input_inception[i]) < 0){
      perror("thread error");
      exit(0);
    }
  }
  
  for(int i=0;i<n_shuffle;i++){
    if (pthread_create(&networkArray_shuffle[i], NULL, (void *(*)(void*))network_predict, &net_input_shuffle[i]) < 0){
      perror("thread error");
      exit(0);
    }
  }
  for(int i=0;i<n_resX;i++){
    if (pthread_create(&networkArray_resX[i], NULL, (void *(*)(void*))network_predict, &net_input_resX[i]) < 0){
      perror("thread error");
      exit(0);
    }
  }

  for (int i = 0; i < n_dense; i++){
    pthread_join(networkArray_dense[i], NULL);
  }
  for (int i = 0; i < n_res; i++){
    pthread_join(networkArray_res[i], NULL);
  }
  for (int i = 0; i < n_alex; i++){
    pthread_join(networkArray_alex[i], NULL);
  }
  for (int i = 0; i < n_vgg; i++){
    pthread_join(networkArray_vgg[i], NULL);
  }
  for (int i = 0; i < n_wide; i++){
    pthread_join(networkArray_wide[i], NULL);
  }
  for (int i = 0; i < n_squeeze; i++){
    pthread_join(networkArray_squeeze[i], NULL);
  }
  for (int i = 0; i < n_mobile; i++){
    pthread_join(networkArray_mobile[i], NULL);
  }
  for (int i = 0; i < n_mnasnet; i++){
    pthread_join(networkArray_mnasnet[i], NULL);
  }
  for (int i = 0; i < n_inception; i++){
    pthread_join(networkArray_inception[i], NULL);
  }
  for (int i = 0; i < n_shuffle; i++){
    pthread_join(networkArray_shuffle[i], NULL);
  }
  for (int i = 0; i < n_resX; i++){
    pthread_join(networkArray_resX[i], NULL);
  }
  double time2 = what_time_is_it_now();

	std::cout << "\n***** TOTAL EXECUTION TIME : "<<time2-time1<<"s ***** \n";
}

If I don’t warm up, there is no error.
Only the Inception model has this problem.