CUDA memory leak after "CUDA out of memory." error

Hello, I have run into a problem with CUDA memory.

I want to fall back to slicing the image input once a 'CUDA out of memory.' error occurs.
The problem: after the 'CUDA out of memory.' error is raised, a memory leak occurs.
It looks like the random input tensor "x" created in inf.run (line 79 of my original file) is never actually freed, and I have found no way to free it.
Does anyone have an idea how to free it?

HELP ME…

I tried the following, but it didn't work:

del x
x.to(torch.device('cpu'))  # .to() returns a new tensor; it does not free the original
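
For context, the slicing fallback I have in mind looks roughly like this (just a sketch; run_with_fallback and the halving strategy are made up for illustration, and the retry only helps if the failed attempt's memory is actually freed):

import torch

def run_with_fallback(model, x, min_h=1024):
    # try the full tensor first; on CUDA OOM, recurse on two height halves.
    # sketch only -- it assumes the model can process height slices
    # independently (real conv borders would need overlap handling)
    try:
        with torch.no_grad():
            return model(x)
    except RuntimeError as e:
        if 'out of memory' not in str(e):
            raise
        torch.cuda.empty_cache()
        h = x.size(2)
        if h <= min_h:
            raise  # cannot slice any further; give up
        top = run_with_fallback(model, x[:, :, :h // 2], min_h)
        bottom = run_with_fallback(model, x[:, :, h // 2:], min_h)
        return torch.cat([top, bottom], dim=2)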

**SYSTEM & ENV**
GPU: NVIDIA RTX 2080 Ti
OS: Windows 10 and Linux (CentOS)
PyTorch: 1.5.1

Full repro script:

import time
from math import sqrt

import torch
from torch import nn

def check_gpu(msg='gpu_check'):
    # print a banner plus the CUDA memory currently allocated on device 0
    print(f'{msg:=^60}')
    print(f'Allocated:, {round(torch.cuda.memory_allocated(0)/1024**3, 6)}GB')
    # uncomment to list every tensor the garbage collector can still see:
    # import gc
    # for i, obj in enumerate(gc.get_objects()):
    #     try:
    #         if torch.is_tensor(obj) or (hasattr(obj, 'data') and torch.is_tensor(obj.data)):
    #             print(f'{i}||{type(obj)}||{obj.size()}')
    #     except Exception:
    #         pass

class Conv_ReLU_Block(nn.Module):
    def __init__(self):
        super(Conv_ReLU_Block, self).__init__()
        self.conv = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1, padding=1, bias=False)
        self.relu = nn.ReLU(inplace=True)
        
    def forward(self, x):
        return self.relu(self.conv(x))
        
# residual-style CNN (the skip connection itself is commented out in forward)
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.residual_layer = self.make_layer(Conv_ReLU_Block, 18)
        self.input = nn.Conv2d(in_channels=1, out_channels=64, kernel_size=3, stride=1, padding=1, bias=False)
        self.output = nn.Conv2d(in_channels=64, out_channels=1, kernel_size=3, stride=1, padding=1, bias=False)
        self.relu = nn.ReLU(inplace=True)
    
        # He initialization for the conv weights
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
                m.weight.data.normal_(0, sqrt(2. / n))
                
    def make_layer(self, block, num_of_layer):
        layers = []
        for _ in range(num_of_layer):
            layers.append(block())
        return nn.Sequential(*layers)

    def forward(self, x):
        # residual = x
        out = self.relu(self.input(x))
        out = self.residual_layer(out)
        out = self.output(out)

        return out
   
# inference class
class inf:
    # init model
    def __init__(self):
        check_gpu('init')
        self.m = Net().to(torch.device('cuda'))
        self.m.eval()
    
    # run inference on a random tensor
    #
    # with input size (1, 1, 50000, 1024):
    # -> 'CUDA out of memory.' is raised
    # -> allocated memory grows on every iteration of the loop in main (the leak)
    #
    # with input size (1, 1, 1024, 1024):
    # -> everything works fine

    def run(self):
        check_gpu('run start')
        try:
            with torch.no_grad():
                # swap in the commented line below to reproduce the OOM and the leak
                # x = torch.rand(1, 1, 50000, 1024, device='cuda')
                x = torch.rand(1, 1, 1024, 1024, device='cuda')
                check_gpu('allocated')
                x = self.m(x)
                
        except Exception as e:
            print(e)
        finally:
            # empty_cache() can only return cached blocks that no live
            # tensor still references
            torch.cuda.empty_cache()

def main():
    inf_ins = inf()

    for _ in range(100):
        t1 = time.time()
        inf_ins.run()
        t2 = time.time()
        # check_gpu('last')
        print(f'time: {t2 - t1}')

    
if __name__ == '__main__':
    main()

Result with a (1, 1, 50000, 1024) input tensor: the allocated CUDA memory keeps rising by about 0.191 GB on every iteration, which is almost exactly the size of the input tensor itself.
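Quick sanity check of that number (float32 is 4 bytes per element; in the log below, the allocated figure grows by 0.385289 - 0.193883 = 0.191406 GB per loop):

print(1 * 1 * 50000 * 1024 * 4 / 1024**3)  # 0.19073486328125 GiB -- the size of the input tensor x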

============================init============================
Allocated:, 0.0GB
=========================run start==========================
Allocated:, 0.002477GB
=========================allocated==========================
Allocated:, 0.193883GB
CUDA out of memory. Tried to allocate 12.21 GiB (GPU 0; 11.00 GiB total capacity; 198.54 MiB already allocated;
time: 0.3150339126586914
=========================run start==========================
Allocated:, 0.193883GB
=========================allocated==========================
Allocated:, 0.385289GB
CUDA out of memory. Tried to allocate 12.21 GiB (GPU 0; 11.00 GiB total capacity; 394.54 MiB already allocated;
time: 0.31797289848327637
=========================run start==========================
Allocated:, 0.385289GB
=========================allocated==========================
Allocated:, 0.576695GB
CUDA out of memory. Tried to allocate 12.21 GiB (GPU 0; 11.00 GiB total capacity; 590.54 MiB already allocated;
time: 0.31999874114990234
=========================run start==========================
Allocated:, 0.576695GB
=========================allocated==========================
Allocated:, 0.768102GB
CUDA out of memory. Tried to allocate 12.21 GiB (GPU 0; 11.00 GiB total capacity; 786.54 MiB already allocated;
time: 0.3209991455078125
=========================run start==========================
Allocated:, 0.768102GB
=========================allocated==========================
Allocated:, 0.959508GB
CUDA out of memory. Tried to allocate 12.21 GiB (GPU 0; 11.00 GiB total capacity; 982.54 MiB already allocated;
time: 0.31999993324279785

Result with a (1, 1, 1024, 1024) input tensor: memory usage is stable across iterations.

============================init============================
Allocated:, 0.0GB
=========================run start==========================
Allocated:, 0.002477GB
=========================allocated==========================
Allocated:, 0.006383GB
time: 1.4230003356933594
=========================run start==========================
Allocated:, 0.002477GB
=========================allocated==========================
Allocated:, 0.006383GB
time: 0.14099979400634766
=========================run start==========================
Allocated:, 0.002477GB
=========================allocated==========================
Allocated:, 0.006383GB
time: 0.13899970054626465
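
For reference, this is the cleanup variant of inf.run that I plan to try next. It is only a sketch based on my current guess (that the exception traceback keeps a reference cycle alive that pins x); I do not know yet whether the explicit del and gc.collect() are actually needed:

import gc

# drop every local reference to the tensor, break possible frame/traceback
# reference cycles, and only then release the cached blocks
def run(self):
    check_gpu('run start')
    x = None
    try:
        with torch.no_grad():
            x = torch.rand(1, 1, 50000, 1024, device='cuda')
            check_gpu('allocated')
            x = self.m(x)
    except RuntimeError as e:
        print(e)
    finally:
        del x                     # drop the last local reference
        gc.collect()              # collect cycles that may still pin the tensor
        torch.cuda.empty_cache()  # now the cached blocks can actually be returned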