Thank you for your attention!
My simplified version of the code is as follow:
bool segMain(vtkImageData* vtkImageDataPixel,vtkImageData* outImageData, std::string modelPath)
{
auto pixelDataShape = vtkImageDataPixel->GetDimensions();
auto spacing = vtkImageDataPixel->GetSpacing();
int imageDims[3];
vtkImageDataPixel->GetDimensions(imageDims);
short* oriPixelPointer = static_cast<short*>(vtkImageDataPixel->GetScalarPointer()); //get data pointer
auto device = torch::kCUDA; // set cuda
torch::jit::script::Module module; // load model
module = torch::jit::load(modelPath);
module.to(device);
module.eval();
at::set_num_threads(1);
at::Tensor imageTensor = torch::from_blob(oriPixelPointer, { imageDims[2], imageDims[0], imageDims[1] }, torch::kShort).to(torch::kFloat32);
imageTensor = torch::flip(imageTensor, { 0,1,2 });
auto imageTensorNorm = (imageTensor + 1024.0);
at::Tensor modelOutputTensor = torch::ones({ imageDims[2], imageDims[0], imageDims[1] }, torch::kFloat32);
at::Tensor imageTensorSlicer, modelOutputTensorSlicer;
for (int i = 0; i < imageDims[2]; i++)
{
imageTensorSlicer = imageTensorNorm[i].unsqueeze(0).unsqueeze(0);
imageTensorSlicer = imageTensorSlicer.to(device); // predict
modelOutputTensorSlicer = module.forward({ imageTensorSlicer }).toTensor();
modelOutputTensorSlicer = torch::one_hot(torch::argmax(torch::softmax(modelOutputTensorSlicer, 1), 1), 2).permute({ 0, 3, 1, 2 }).detach();
modelOutputTensor[i] = modelOutputTensorSlicer[0][1].data();
}
modelOutputTensor = torch::flip(modelOutputTensor, { 0,1,2 });
modelOutputTensor = modelOutputTensor.to(torch::kCPU);
modelOutputTensor = modelOutputTensor.to(torch::kShort);
vtkSmartPointer<vtkImageData> vtkModelOutputImageData = vtkImageData::New();
vtkModelOutputImageData->SetDimensions(pixelDataShape[0], pixelDataShape[1], pixelDataShape[2]);
vtkModelOutputImageData->SetExtent(0, pixelDataShape[0] - 1, 0, pixelDataShape[1] - 1, 0, pixelDataShape[2] - 1);
vtkModelOutputImageData->AllocateScalars(VTK_SHORT, 1);
vtkModelOutputImageData->SetSpacing(spacing[0], spacing[1], spacing[2]);
short* pixelPointer = static_cast<short*>(vtkModelOutputImageData->GetScalarPointer()); //get data pointer
memcpy(pixelPointer, modelOutputTensor.data_ptr(), sizeof(short) * pixelDataShape[0] * pixelDataShape[1] * pixelDataShape[2]);
c10::cuda::CUDACachingAllocator::emptyCache();
outImageData->DeepCopy(vtkModelOutputImageData);
free(vtkModelOutputImageData);
return true;
}
int main()
{
std::cout << "main func start run ..." << std::endl;
std::string dataDirPath = "D:\\sourceData.mhd";
std::string modelPath = "C:\\net.pt";
auto imageReader = vtkSmartPointer<vtkMetaImageReader>::New();
imageReader->SetFileName(dataDirPath.c_str());
imageReader->Update();
vtkImageData* vtkImageData = imageReader->GetOutput();
auto outImage = vtkImageData::New();
auto r = segMain(vtkImageData, outImage, modelPath);
outImage->Delete();
cin.clear();
cin.sync();
Sleep(10000);
cout << " stop " << endl;
}
I think I have done the two things. but until it run to “cout << " stop " << endl;” , the CPU memory have not been released. And it will take up the same amount of memory, when I run segMain twice.
As far as know, the network parameters and the tensor is similar to a c++ smart pointer. But the CPU memory remains unfreed, even after a long sleep.
Thank you for the response.
Best regards
cm