LibTorch (C++): GPU memory is not released after the inference function is called

I am trying to build a LibTorch C++ shared library with CMake so that I can use it later from Python. My C++ class is as follows:

#include <torch/torch.h>
#include <torch/script.h>
// This header is what defines the custom class registration
// behavior specifically. script.h already includes this, but
// we include it here so you know it exists in case you want
// to look at the API or implementation.
#include <torch/custom_class.h>

#include <string>
#include <vector>
#include <ATen/ATen.h>
#include <cuda_runtime_api.h>
#include <cuda.h>

#include <c10/cuda/CUDACachingAllocator.h>

/// Custom TorchScript class that owns a TorchScript module loaded onto the
/// CUDA device and exposes one inference entry point.
///
/// @tparam T element type of the `stack_` member (not used by any member
///           function shown here).
template <class T>
struct MyStackClass : torch::CustomClassHolder {
  std::vector<T> stack_;
  torch::jit::script::Module module;

  /// Loads the serialized TorchScript module found at `init` onto the GPU.
  ///
  /// @param init path to the serialized module file.
  /// @throws c10::Error if the module cannot be loaded. The error is logged
  ///         and rethrown so the caller never receives an object whose
  ///         `module` is empty.
  MyStackClass(std::string init) {
    try {
      module = torch::jit::load(init, torch::kCUDA);
      // The class is used for inference only: switch dropout / batch-norm
      // style layers to evaluation behaviour once, at load time.
      module.eval();
    } catch (const c10::Error& e) {
      std::cerr << "error loading the face model\n";
      throw;
    }
  }

  /// Runs the loaded module on `input_tensor` and returns its output tensor.
  ///
  /// @param input_tensor model input; presumably already on the same CUDA
  ///        device as the module (the caller is expected to move it there)
  ///        -- TODO confirm against the calling code.
  /// @return the module's forward output converted to a tensor.
  at::Tensor get_face_feat(torch::Tensor& input_tensor) {
    // Disable gradient recording for the forward pass. With autograd
    // enabled, the returned tensor keeps the graph and its intermediate
    // activations alive, so GPU memory use grows with every call.
    torch::NoGradGuard no_grad;

    std::vector<torch::jit::IValue> inputs;
    inputs.push_back(input_tensor);

    // NOTE(review): the original had an incomplete `output = ...;` statement
    // between the forward call and the return. If post-processing (e.g. a
    // device transfer) was intended, restore it here.
    // Execute the model and turn its output into a tensor.
    return module.forward(inputs).toTensor();
  }
};


// Registers MyStackClass<std::string> with TorchScript under the
// `my_classes` namespace, so it is reachable from Python as
// `torch.classes.my_classes.MyStackClass`.
TORCH_LIBRARY(my_classes, m) {
  m.class_<MyStackClass<std::string>>("MyStackClass")
    // Constructor taking the path to the serialized module.
    .def(torch::init<std::string>())
    // Inference entry point.
    .def("get_face_feat", &MyStackClass<std::string>::get_face_feat);
}

I can build the library with CMake using the following CMakeLists.txt:

cmake_minimum_required(VERSION 3.1 FATAL_ERROR)
# Declare the project explicitly; without this CMake falls back to an
# implicit project() call and emits a warning.
project(custom_class)

find_package(Torch REQUIRED )

find_package( CUDA REQUIRED)

# Define our library target
add_library(custom_class SHARED class.cpp)
# LibTorch headers need at least C++14 (raise this to 17 for recent LibTorch
# releases that require it).
set_property(TARGET custom_class PROPERTY CXX_STANDARD 14)
# Link against LibTorch
target_link_libraries(custom_class "${TORCH_LIBRARIES}")

In python I can use the library as follows:

import torch
import cv2
from torchvision import transforms
import sys
# `torch.classes.load_library()` allows you to pass the path to your .so file
# to load it in and make the custom C++ classes available to both Python and
# TorchScript
torch.classes.load_library("<path_to_libcustom_class.so>")
# You can query the loaded libraries like this:
print(torch.classes.loaded_libraries)

# Build the C++ object; its constructor loads the TorchScript checkpoint.
s = torch.classes.my_classes.MyStackClass("<path_to_checkpoint.pth>")
im = cv2.imread('<path_to_image.jpg>')

# Preprocess: resize to the 112x112 input, apply the transform pipeline and
# add a leading batch dimension.
transform = transforms.Compose([ ...  ])
normface = cv2.resize(im, (112,112))
normface = transform(normface)
normface = torch.unsqueeze(normface, 0)

# Pass the preprocessed tensor (moved to the GPU), not a device string.
# Running under no_grad() keeps autograd from recording the forward pass, so
# intermediate activations are not kept alive between calls.
with torch.no_grad():
    feat = s.get_face_feat(normface.to('cuda'))

Now the problem is that GPU memory is not released after the last line (the call to `s.get_face_feat(...)`); instead, GPU memory usage keeps increasing with each call. Any guidance is appreciated.