Hello,
I'm getting a CUDA out-of-memory error after iterating over a number of batches. My guess is that the computation graph is being kept alive somewhere, but I can't figure out where. Here is the traceback:
File "/home/ubuntu/repositories/GraphExp/train.py", line 105, in train
out, loss_value = self._pass(input, phase="train")
File "/home/ubuntu/repositories/GraphExp/train.py", line 90, in _pass
loss_value.backward()
File "/home/ubuntu/miniconda3/envs/gatv2/lib/python3.7/site-packages/torch/_tensor.py", line 363, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/ubuntu/miniconda3/envs/gatv2/lib/python3.7/site-packages/torch/autograd/__init__.py", line 175, in backward
allow_unreachable=True, accumulate_grad=True) # Calls into the C++ engine to run the backward pass
RuntimeError: CUDA out of memory. Tried to allocate 1.84 GiB (GPU 0; 11.17 GiB total capacity; 10.03 GiB already allocated; 316.25 MiB free; 10.50 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
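For completeness: the error itself suggests trying max_split_size_mb against fragmentation. If I read the memory-management docs right, that's configured through an environment variable which the caching allocator reads when CUDA is first initialized, so it would look something like the sketch below (the value 128 is just a guess on my part, not something I've tuned):

import os

# Must be set before the first CUDA allocation; the allocator reads it once.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"

import torch  # importing afterwards is safe; CUDA is initialized lazily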
Here's my training loop and the helpers involved:
def train(self):
    # breakpoint()
    self.model.train()
    avg_loss = AverageMeter()
    avg_top1 = AverageMeter()
    avg_top5 = AverageMeter()
    early_stopping_counter = 0
    for epoch in range(self.config["training_config"]["epoch"]):
        for batch_idx, input in enumerate(self.dataloaders["train"]):
            input = input.to(self.device)
            out, loss_value = self._pass(input, phase="train")
            avg_loss.update(loss_value, input.num_graphs)
            top1, top5 = self.cls_accuracy(
                output=out.detach().cpu().data,
                target=input.y.detach().cpu().data,
                topk=(1, 5),
                n_classes=self.config["data_config"]["num_classes"],
            )
            avg_top1.update(top1, input.num_graphs)
            avg_top5.update(top5, input.num_graphs)
            self._log_training(epoch, batch_idx, loss_value)
class AverageMeter:
    def __init__(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

    @property
    def value(self):
        return round(self.avg, 8)
def _pass(self, data, phase="val"):
    out = self.model(data.x, data.edge_index, data.batch)
    out = F.softmax(out, dim=1)
    loss_value = self.criterion(out, data.y)
    if phase == "train":
        loss_value.backward()
        self.optimizer.step()
        self.optimizer.zero_grad()
    return out, loss_value.detach().cpu().item()
@staticmethod
def cls_accuracy(output, target, n_classes, topk=(1,)):
    _, pred = output.topk(max(topk), 1, True, True)
    idx = (pred[:, 0] < n_classes)
    pred = pred[idx, :]
    target = target[idx]
    pred = pred.t()
    correct = pred.eq(target.view(1, -1).expand_as(pred))
    res = []
    for k in topk:
        correct_k = correct[:k].float().max(0)[0].sum()
        res.append((correct_k / target.size(0)).detach().item())
    return res
Here's what a batch looks like, plus a quick check that the node features are a leaf tensor:

>>> input
DataBatch(x=[874, 1032], edge_index=[2, 24908], edge_attr=[24908, 2], y=[48], batch=[874], ptr=[49])
>>> input.x.is_leaf
True
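I checked is_leaf to rule out the batch tensors dragging an autograd graph along with them; a more direct check would probably be requires_grad / grad_fn on each tensor in the batch, something like:

for name in ("x", "edge_index", "edge_attr", "y"):
    t = getattr(input, name)
    print(name, t.requires_grad, t.grad_fn)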
I'm using PyTorch Geometric. CUDA runs out of memory after 78 of the 119 batches in the epoch.
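To tell whether allocations actually creep up batch over batch (which would point at a retained graph) or whether batch 78 is simply much larger than the others, I'm thinking of logging allocator stats inside the inner loop, roughly like this (epoch, batch_idx, and input are the loop variables from train above):

alloc_mib = torch.cuda.memory_allocated() / 2**20
peak_mib = torch.cuda.max_memory_allocated() / 2**20
print(f"epoch {epoch} batch {batch_idx}: "
      f"allocated={alloc_mib:.0f} MiB peak={peak_mib:.0f} MiB "
      f"nodes={input.num_nodes} graphs={input.num_graphs}")
# A steady climb in allocated memory would suggest retained references;
# a flat line ending in one big spike would suggest an oversized batch.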
Do you see any memory leak here?
Thank you