nn.Linear doesn't free memory after processing input

class BertIntermediate(nn.Module):
    """Linear projection followed by an activation function.

    The projection maps ``config.hidden_size`` features to
    ``config.intermediate_size`` features. The activation is taken from
    ``config.hidden_act``: a string is looked up in ``ACT2FN``, anything
    else is used as the activation directly.
    """

    def __init__(self, config):
        super().__init__()
        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
        activation = config.hidden_act
        # Strings are names to resolve via ACT2FN; other values are used as-is.
        self.intermediate_act_fn = (
            ACT2FN[activation] if isinstance(activation, str) else activation
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Return the activation applied to the dense projection of the input."""
        return self.intermediate_act_fn(self.dense(hidden_states))

How can I free the memory after passing the input through hidden_states = self.dense(hidden_states)?

self.dense =nn.Linear(config.hidden_size, config.intermediate_size)

consumes 67 MB for a shape of 1900 x 512.

It keeps accumulating every time I pass a new sample for inference.

Is there a way to free this memory? Calling this on every new sample keeps accumulating memory and results in a memory leak.
@ptrblck

I cannot reproduce any memory leak and see the expected memory usage:

import torch
import torch.nn as nn

class BertIntermediate(nn.Module):
    """1024 -> 1024 linear layer plus ReLU that reports memory while running.

    ``forward`` prints the value of ``torch.cuda.memory_allocated()``
    (converted to MB) on entry, after the linear layer, and after the
    activation.
    """

    def __init__(self):
        super().__init__()
        self.dense = nn.Linear(1024, 1024)
        self.intermediate_act_fn = nn.ReLU()

    @staticmethod
    def _allocated_mb() -> float:
        """Return ``torch.cuda.memory_allocated()`` divided by 1024**2."""
        return torch.cuda.memory_allocated() / 1024**2

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """Apply the linear layer and the activation, printing memory usage."""
        print("start of forward, {:.3f}MB allocated".format(self._allocated_mb()))
        # Rebind the same name at every step so no extra reference to an
        # intermediate tensor is held while the next measurement is taken.
        hidden_states = self.dense(hidden_states)
        print("after self.dense, {:.3f}MB allocated".format(self._allocated_mb()))
        hidden_states = self.intermediate_act_fn(hidden_states)
        print("after act, {:.3f}MB allocated".format(self._allocated_mb()))
        return hidden_states

# Reproduction script: print torch.cuda.memory_allocated() (in MB) before
# setup, after moving the model and input to the device, and after each of
# three identical no-grad forward passes.
bytes_per_mb = 1024**2

print("start, {:.3f}MB allocated".format(torch.cuda.memory_allocated() / bytes_per_mb))
# start, 0.000MB allocated
device = "cuda"
model = BertIntermediate().to(device)
x = torch.randn(1024, 1024, device=device)
print("after setup, {:.3f}MB allocated".format(torch.cuda.memory_allocated() / bytes_per_mb))
# after setup, 8.004MB allocated

# The previous pass's `out` stays referenced while the next pass runs and is
# only released when `out` is rebound.
for _ in range(3):
    with torch.no_grad():
        out = model(x)
    print("after forward, {:.3f}MB allocated".format(torch.cuda.memory_allocated() / bytes_per_mb))

# Recorded output, pass 1:
# start of forward, 8.004MB allocated
# after self.dense, 20.129MB allocated
# after act, 20.129MB allocated
# after forward, 20.129MB allocated

# Recorded output, pass 2:
# start of forward, 20.129MB allocated
# after self.dense, 24.129MB allocated
# after act, 24.129MB allocated
# after forward, 20.129MB allocated

# Recorded output, pass 3:
# start of forward, 20.129MB allocated
# after self.dense, 24.129MB allocated
# after act, 24.129MB allocated
# after forward, 20.129MB allocated

I mean the virtual memory, not the CUDA memory, @ptrblck.