Why can't libtorch run inference after cudaDeviceReset?

Assassintears · November 3, 2023, 6:50am

Hi, I use libtorch to deploy my project. Sometimes I want to reload the model to detect objects, and for that I call cudaDeviceReset to release resources. My code looks like this:

// 
/****************
some preprocess
****************/
#region load model and detect
// Load the TorchScript module from modelFile and move it to the CUDA device.
m_device = torch::Device(torch::kCUDA);
m_model = std::make_unique<torch::jit::Module>(torch::jit::load(modelFile));
m_model->to(m_device);

// Read the image with OpenCV and convert it to three channels.
// NOTE(review): cv::imread with default flags presumably returns a 3-channel
// image, while COLOR_GRAY2RGB expects a single-channel input - confirm how the
// image is actually loaded.
cv::Mat mat = cv::imread("xxxx.bmp");
cvtColor(mat, mat, cv::COLOR_GRAY2RGB);
// some other process

// Build the network input in one chain:
//   1. view the pixel buffer as a byte tensor of shape { 1, imgRow, imgCol, 3 }
//   2. move the channel axis to position 1
//   3. convert to float32 and divide by 255
//   4. transfer the tensor to the device (GPU)
torch::Tensor inputs = torch::from_blob(mat.data, { 1, imgRow, imgCol, 3 }, torch::kByte)
                           .permute({ 0, 3, 1, 2 })
                           .toType(torch::kFloat32)
                           .div(255)
                           .to(m_device);

// Bail out if no model is loaded.
if (!m_model)
{
	return -1;
}

// this works well
auto out = m_model->forward({ inputs }).toTuple();

#endregion



#region release model
// Release the loaded module and return cached GPU memory before reloading.
if (nullptr != m_model)
{
	// Move the module to the CPU before destroying it.
	m_model->to(torch::kCPU);

	// reset() destroys the owned Module and leaves m_model null. Unlike
	// release(), which only gives up ownership without deleting the object,
	// this does not leak the module on every reload cycle.
	m_model.reset();

	// Ask the CUDA caching allocator to give back its cached blocks.
	if (nullptr != c10::cuda::CUDACachingAllocator::get())
	{
		c10::cuda::CUDACachingAllocator::emptyCache();
	}
}

// NOTE(review): cudaDeviceReset() tears down the CUDA state of the current
// device for this process; libtorch presumably still holds state tied to it,
// which may be why a later forward() fails - confirm. These two calls are the
// ones the question is about.
cudaDeviceReset();
cudaDeviceSynchronize();
#endregion

// after releasing the model, run the `load model and detect` code again

Well, the code above shows the main procedure. After running it 3 times, m_model->forward({ inputs }).toTuple(); throws an exception. If I do not call cudaDeviceReset(); cudaDeviceSynchronize();, it works well.

I want to know why calling cudaDeviceReset multiple times causes an inference problem. Please help me resolve this problem, thank you very much.

ptrblck · November 3, 2023, 12:43pm

Why do you want to reset the GPU, and which error are you seeing? Are you also resetting libtorch and initializing it again afterwards?