How can I release the CUDA memory occupied by variables created inside a function?

The function in question is as follows:

import torch

def batch_prototype(feature, mask):  # returns a tensor of shape B x C x feature_size
    # Pre-allocate the per-class prototype buffer directly on the GPU.
    batch_pro = torch.zeros(mask.shape[0], mask.shape[1], feature.shape[1]).to('cuda')
    for i in range(mask.shape[1]):
        # Soft mask of class i, unsqueezed to B x 1 x H x W for pooling.
        classmask = mask[:, i, :, :]
        proclass = masked_average_pooling(feature, classmask.unsqueeze(1))
        batch_pro[:, i, :] = proclass
    return batch_pro
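
(For reference, masked_average_pooling is not shown in this post; assuming it computes the spatially masked mean, i.e. sum(feature * mask) / (sum(mask) + eps) over the H and W dimensions, which is the common definition, the loop and the pre-allocated buffer can be collapsed into a single einsum. This is only a sketch under that assumption, not a guaranteed drop-in replacement.)

def batch_prototype_vectorized(feature, mask, eps=1e-5):
    # feature: B x C_f x H x W, mask: B x N_cls x H x W (softmax probabilities).
    # Weighted feature sum per class: B x N_cls x C_f.
    numerator = torch.einsum('bchw,bnhw->bnc', feature, mask)
    # Total mask weight per class: B x N_cls x 1 (broadcasts over C_f).
    denominator = mask.sum(dim=(2, 3)).unsqueeze(-1) + eps
    return numerator / denominator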

This function is called inside the decoder:

import torch
import torch.nn as nn

class Decoder_idea(nn.Module):
    def __init__(self, params):
        super(Decoder_idea, self).__init__()
        self.params = params
        self.in_chns = self.params['in_chns']
        self.ft_chns = self.params['feature_chns']
        self.n_class = self.params['class_num']
        self.bilinear = self.params['bilinear']
        assert (len(self.ft_chns) == 5)

        self.up1 = UpBlock(
            self.ft_chns[4], self.ft_chns[3], self.ft_chns[3], dropout_p=0.0)
        self.up2 = UpBlock(
            self.ft_chns[3], self.ft_chns[2], self.ft_chns[2], dropout_p=0.0)
        self.up3 = UpBlock(
            self.ft_chns[2], self.ft_chns[1], self.ft_chns[1], dropout_p=0.0)
        self.up4 = UpBlock(
            self.ft_chns[1], self.ft_chns[0], self.ft_chns[0], dropout_p=0.0)

        self.out_conv = nn.Conv2d(self.ft_chns[0], self.n_class,
                                  kernel_size=3, padding=1)

    def forward(self, feature):
        x0 = feature[0]
        x1 = feature[1]
        x2 = feature[2]
        x3 = feature[3]
        x4 = feature[4]

        x = self.up1(x4, x3)
        up1_out = x
        x = self.up2(x, x2)
        up2_out = x
        x = self.up3(x, x1)
        up3_out = x
        x = self.up4(x, x0)
        up4_out = x
        # print(x.shape,'feature_shape')
        output = self.out_conv(x)
        mask = torch.softmax(output, dim=1)
        batch_pro = batch_prototype(x, mask)  # <-- the allocation in question
        torch.cuda.empty_cache()
        intermediate_out = {
            'up1': up1_out,
            'up2': up2_out,
            'up3': up3_out,
            'up4': up4_out,
        }
        return output, intermediate_out, batch_pro

Now my problem: every time batch_prototype is called to compute batch_pro, the CUDA memory it occupies is never released, even though I call torch.cuda.empty_cache() right after. The allocation therefore grows over time until CUDA reports that it cannot allocate any more memory. How can I solve this?
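
For reference, here is a minimal way to watch the allocator around the call in isolation (the shapes and the masked_average_pooling stand-in are hypothetical, only so the snippet runs on its own alongside the batch_prototype definition above):

import torch

def masked_average_pooling(feature, mask, eps=1e-5):
    # Stand-in with assumed semantics: spatially masked mean -> B x C_f.
    return (feature * mask).sum(dim=(2, 3)) / (mask.sum(dim=(2, 3)) + eps)

# Hypothetical shapes, just to exercise batch_prototype by itself.
B, C_f, N_cls, H, W = 4, 16, 4, 64, 64
feature = torch.randn(B, C_f, H, W, device='cuda')
mask = torch.softmax(torch.randn(B, N_cls, H, W, device='cuda'), dim=1)

for step in range(5):
    batch_pro = batch_prototype(feature, mask)
    torch.cuda.empty_cache()
    # memory_allocated() counts tensors that are still referenced;
    # empty_cache() only returns *unreferenced* cached blocks to the driver.
    print(step,
          torch.cuda.memory_allocated() // 2**20, 'MiB allocated,',
          torch.cuda.memory_reserved() // 2**20, 'MiB reserved')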