Hello,
I am wondering why the memory is not being released after inference, the following is my code (transformer is imported from huggingface):
@torch.no_grad()
def inference(self, hidden_state, mask, id):
self.eval()
print("begin_max", torch.cuda.max_memory_allocated("cuda:4")/ 1024 / 1024)
print("begin", torch.cuda.memory_allocated("cuda:4")/ 1024 / 1024)
distilbert_output = self.bert(inputs_embeds=hidden_state, attention_mask=mask, return_dict=False)
print("middle_max", torch.cuda.max_memory_allocated("cuda:4")/ 1024 / 1024)
print("middle", torch.cuda.memory_allocated("cuda:4")/ 1024 / 1024)
gc.collect()
torch.cuda.empty_cache()
gc.collect()
hidden_state = distilbert_output[0]
pooled_output = hidden_state[:, 0]
x = pooled_output
x = F.dropout(x, p=args.dropout, training=self.training)
del hidden_state, pooled_output
print("final_max", torch.cuda.max_memory_allocated("cuda:4")/ 1024 / 1024)
print("final", torch.cuda.memory_allocated("cuda:4")/ 1024 / 1024)
for i, lin in enumerate(self.lins[:-1]):
x = lin(x)
#x = self.bns[i](x)
x = F.relu(x)
x = F.dropout(x, p=args.dropout, training=self.training)
x = self.lins[-1](x)
self.z_mlp = self.z_mlp.to(device)
self.z_mlp[id] = x.clone().detach()
print("final2", torch.cuda.max_memory_allocated("cuda:4")/ 1024 / 1024)
torch.cuda.empty_cache()
return x
Here are the results:
begin_max 4217.28662109375
begin 4217.28662109375
middle_max 39844.28662109375
middle 7967.28662109375
final_max 39844.28662109375
final 7996.58349609375
It seems after either distilbert_output = self.bert(inputs_embeds=hidden_state, attention_mask=mask, return_dict=False)
or MLP, the memory still increases, do I miss something? Is this a memory leakage?
Any help would be appreciated!