Network module memory is not released in C++ libtorch

I tested the example code from this issue: https://github.com/pytorch/pytorch/issues/106255

I fixed forward as below:

torch::Tensor forward(torch::Tensor x)
{
    x = fc->forward(x);
    return x;
}

Here is the result:

Memory used: 382
Memory used: 766
Memory used: 1362
Memory used: 1362
Memory used: 978

Unlike with the original example code, some memory remains even after clearing the cache.
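
One thing I am not sure about is whether autograd state plays a role here: forward records a graph by default, and the output's grad_fn keeps references to tensors saved for backward. Below is a sketch of a no-grad variant (an assumption on my part, using the standard torch::NoGradGuard; Net is the class from the full code further down):

// Sketch: disable autograd recording so the returned tensor carries no
// grad_fn (and thus no references to tensors saved for backward).
torch::Tensor forward_no_grad(Net& net, torch::Tensor input)
{
    torch::NoGradGuard no_grad; // autograd is off within this scope
    return net.forward(input);
}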

Next, I tried:

cudaFree(input.data_ptr());
cudaFree(output.data_ptr());

The result is:

Memory used: 214

Something still remains. Where does this memory come from, and how can I release it?
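
For reference, one way to tell whether the leftover memory is still tracked by the caching allocator at all, or lives outside it (CUDA context, cuBLAS handles, etc.), is to compare cudaMemGetInfo against the allocator's own statistics. A sketch, assuming the getDeviceStats API from c10/cuda/CUDACachingAllocator.h:

#include <c10/cuda/CUDACachingAllocator.h>
#include <iostream>

// Sketch: print what the caching allocator believes it holds. Any gap
// between this and cudaMemGetInfo is memory the allocator never owned.
void print_allocator_stats(int device = 0)
{
    using namespace c10::cuda::CUDACachingAllocator;
    DeviceStats stats = getDeviceStats(device);
    auto agg = static_cast<size_t>(StatType::AGGREGATE);
    std::cout << "allocated: "
              << stats.allocated_bytes[agg].current / (1024.0 * 1024.0) << " MB, "
              << "reserved: "
              << stats.reserved_bytes[agg].current / (1024.0 * 1024.0) << " MB"
              << std::endl;
}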

(Full code)

#include <torch/torch.h>
#include <c10/cuda/CUDACachingAllocator.h>
#include <cuda_runtime_api.h> // for cudaMemGetInfo / cudaFree
#include <iostream>

using namespace std;

class Net : public torch::nn::Module
{
public:
    Net(int fc_size = 100)
    {
        fc = torch::nn::Linear(fc_size, fc_size);
        fc->to(torch::kCUDA);
        register_module("fc", fc);
    }

    torch::Tensor forward(torch::Tensor x)
    {
        x = fc->forward(x);
        return x;
    }

private:
    torch::nn::Linear fc = nullptr;
};

double get_cuda_memory_usage()
{
    size_t free_mem, total_mem;
    cudaMemGetInfo(&free_mem, &total_mem);
    size_t used_mem = total_mem - free_mem;

    double used_mem_mb = used_mem / (1024.0 * 1024.0);
    return used_mem_mb;
}

void print_cuda_memory_diff(double start_mem, double end_mem)
{
    cout << "Memory used: " << end_mem - start_mem << endl;
}

void memory_test(int size, double start_memory)
{
    // Create an empty tensor
    auto input = torch::empty({size, size}).to(torch::kCUDA);
    // GPU Memory: 1 * (input)
    print_cuda_memory_diff(start_memory, get_cuda_memory_usage());

    // Create a network with same size of tensor
    Net *net = new Net(size);
    // GPU Memory: 2 * (input)
    print_cuda_memory_diff(start_memory, get_cuda_memory_usage());

    // Pass the input through the network (allocates an output tensor of the same size)
    auto output = net->forward(input);
    // GPU Memory: 3 * (input)
    print_cuda_memory_diff(start_memory, get_cuda_memory_usage());

    delete net;
    // GPU Memory: 3 * (input) (no change; the freed blocks stay in the allocator's cache)
    print_cuda_memory_diff(start_memory, get_cuda_memory_usage());

    // Manually clear the cache and return all unused GPU memory to the driver (just for this example; not generally recommended)
    c10::cuda::CUDACachingAllocator::emptyCache();
    // GPU Memory: 1 * (input)
    print_cuda_memory_diff(start_memory, get_cuda_memory_usage());

    // Free the tensors' backing storage directly, behind the allocator's back
    cudaFree(input.data_ptr());
    cudaFree(output.data_ptr());
    print_cuda_memory_diff(start_memory, get_cuda_memory_usage());
}

int main()
{
    auto size = 10000;
    auto start_memory = get_cuda_memory_usage();

    memory_test(size, start_memory);

    return 0;
}
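
As a side note on the cudaFree(...data_ptr()) calls: freeing storage behind the caching allocator's back is risky, because the tensors still believe they own those blocks and will return them to the allocator's pool when destroyed. A safer sketch of the same release (my own restructuring, reusing the Net class above) is to let the tensors' scope end before emptying the cache:

// Sketch: let all tensor lifetimes end naturally, then empty the cache.
void memory_test_scoped(int size)
{
    {
        auto input = torch::empty({size, size}).to(torch::kCUDA);
        Net net(size); // stack-allocated; destroyed at scope exit
        auto output = net.forward(input);
    } // input, net, and output are all returned to the caching allocator here

    // With no live tensors, emptyCache() can hand every cached block back
    // to the driver; whatever cudaMemGetInfo still reports afterwards was
    // never the allocator's to free (context, library workspaces, etc.).
    c10::cuda::CUDACachingAllocator::emptyCache();
}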