Memory leak in libtorch even after using torch::NoGradGuard no_grad;

There is a memory leak in libtorch, and it occupies a lot of GPU memory. Here is my code:

#include <unistd.h>
#include <iostream>

#include <torch/script.h>
#include <torch/torch.h>
#include <torch/utils.h>

#include "fReader.h"
//#include <cuda_profiler_api.h>

int main (int argc, char *argv[])
{
    // each of fReader, fReader1 and fReader2 occupies 700 MB
    void *fReader  = NULL;
    void *fReader1 = NULL;
    void *fReader2 = NULL;

    {
        torch::NoGradGuard no_grad_guard;

        // url1, url2, url3 are the camera stream URLs (defined elsewhere)
        fReader  = videoReader::Init (url1);
        fReader1 = videoReader::Init (url2);
        fReader2 = videoReader::Init (url3);
    }

    do {
        // decoding only for one fReader, as an example
        void *pOutData = NULL;
        int nResult = videoReader::GetVideoFrame (fReader, &pOutData);

        auto options = torch::TensorOptions().device(torch::kCUDA, 0);
        auto input_tensor = torch::from_blob(pOutData, {1, 640, 360, 3}, options);
    } while (1);

    return 0;
}

Here, I am accessing a camera feed through fReader, which initialises the camera stream. It is similar to OpenCV, except that it returns its output in CUDA memory. When I initialise the camera feeds without the libtorch library, each feed takes about 200 MB of GPU memory, but when I initialise them with libtorch linked in, each of fReader, fReader1 and fReader2 takes around 700 MB.
So:
GPU memory consumption when initialising the camera feeds without libtorch: 600 MB (3 × 200 MB)
GPU memory consumption when initialising the camera feeds with libtorch: 2100 MB (3 × 700 MB)
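In case the numbers above need checking, this is a minimal sketch of how I could measure the per-initialisation cost with the plain CUDA runtime; cudaMemGetInfo() is the standard CUDA call, while freeGpuMB() and the commented videoReader::Init line are just placeholders for my setup:

#include <cuda_runtime.h>
#include <cstdio>

// Query how much GPU memory is currently free, in MB.
static size_t freeGpuMB ()
{
    size_t freeB = 0, totalB = 0;
    cudaMemGetInfo (&freeB, &totalB);
    return freeB >> 20;
}

int main ()
{
    size_t before = freeGpuMB ();
    // ... one initialisation goes here, e.g. fReader = videoReader::Init (url1); ...
    size_t after  = freeGpuMB ();
    printf ("before: %zu MB free, after: %zu MB free, cost: %zu MB\n",
            before, after, before - after);
    return 0;
}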

Note that the camera feed is returned as a CUDA array, which I am converting to a torch tensor using torch::from_blob().
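To be explicit about the conversion, this is roughly what I do with the device pointer. As far as I know, torch::from_blob() only wraps the existing buffer; it does not allocate, copy, or take ownership of it, so the extra memory should not be coming from this call (the kUInt8 dtype here is just an assumption about my frame format):

// pOutData is the CUDA device pointer returned by videoReader::GetVideoFrame().
auto options = torch::TensorOptions()
                   .dtype(torch::kUInt8)        // assumption: 8-bit RGB frames
                   .device(torch::kCUDA, 0);

// Non-owning view over the reader's buffer: no allocation, no copy.
auto frame = torch::from_blob(pOutData, {1, 640, 360, 3}, options);

// If an independent, libtorch-owned copy is needed, clone() allocates one
// through libtorch's CUDA caching allocator.
auto owned = frame.clone();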

Even when I use torch::NoGradGuard no_grad_guard;, there is no change in memory consumption. I have no idea where the extra memory consumption for each initialisation comes from, since each library is loaded only once as a shared library.
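Is there something else I should be releasing explicitly? For example, would flushing libtorch's CUDA caching allocator once the tensors go out of scope change what nvidia-smi reports? A rough sketch of what I mean (c10::cuda::CUDACachingAllocator::emptyCache() is the libtorch call; where I place it is just a guess):

#include <c10/cuda/CUDACachingAllocator.h>

{
    torch::NoGradGuard no_grad_guard;

    auto options = torch::TensorOptions().device(torch::kCUDA, 0);
    auto input_tensor = torch::from_blob(pOutData, {1, 640, 360, 3}, options);
    // ... inference on input_tensor ...
}   // tensors created in this scope are destroyed here

// Return cached-but-unused blocks to the driver so nvidia-smi shows only
// what libtorch is actually still holding.
c10::cuda::CUDACachingAllocator::emptyCache();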