CUDA out of memory after some batches

Hello,

I am getting a CUDA out-of-memory error after iterating over some batches. My guess is that the computation graph is being kept alive somewhere, but I can't figure out where.

  File "/home/ubuntu/repositories/GraphExp/train.py", line 105, in train
    out, loss_value = self._pass(input, phase="train")
  File "/home/ubuntu/repositories/GraphExp/train.py", line 90, in _pass
    loss_value.backward()
  File "/home/ubuntu/miniconda3/envs/gatv2/lib/python3.7/site-packages/torch/_tensor.py", line 363, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/home/ubuntu/miniconda3/envs/gatv2/lib/python3.7/site-packages/torch/autograd/__init__.py", line 175, in backward
    allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to run the backward pass
RuntimeError: CUDA out of memory. Tried to allocate 1.84 GiB (GPU 0; 11.17 GiB total capacity; 10.03 GiB already allocated; 316.25 MiB free; 10.50 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
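
To check whether allocated memory actually grows batch after batch (rather than one batch simply being unusually large), I drop a small helper like this into the loop. This is just a debugging sketch; log_gpu_memory is my own function, not part of the project:

    import torch

    def log_gpu_memory(tag=""):
        # Print currently allocated vs. reserved memory on the default CUDA device.
        alloc = torch.cuda.memory_allocated() / 1024 ** 3
        reserved = torch.cuda.memory_reserved() / 1024 ** 3
        print(f"{tag} allocated={alloc:.2f} GiB reserved={reserved:.2f} GiB")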

Here's my training loop:


    def train(self):
        # breakpoint()
        self.model.train()
        avg_loss = AverageMeter()
        avg_top1 = AverageMeter()
        avg_top5 = AverageMeter()
        early_stopping_counter = 0
        for epoch in range(self.config["training_config"]["epoch"]):
            for batch_idx, input in enumerate(self.dataloaders["train"]):
                input = input.to(self.device) 
                out, loss_value = self._pass(input, phase="train")
                avg_loss.update(loss_value, input.num_graphs)
                top1, top5 = self.cls_accuracy(output=out.detach().cpu().data, target=input.y.detach().cpu().data, topk=(1, 5),
                                               n_classes=self.config["data_config"]["num_classes"])
                avg_top1.update(top1, input.num_graphs)
                avg_top5.update(top5, input.num_graphs)
                self._log_training(epoch, batch_idx, loss_value)

    class AverageMeter:
        def __init__(self):
            self.val = 0
            self.avg = 0
            self.sum = 0
            self.count = 0

        def reset(self):
            self.val = 0
            self.avg = 0
            self.sum = 0
            self.count = 0

        def update(self, val, n=1):
            self.val = val
            self.sum += val * n
            self.count += n
            self.avg = self.sum / self.count

        @property
        def value(self):
            return round(self.avg, 8)

    def _pass(self, data, phase="val"):
        out = self.model(data.x, data.edge_index, data.batch)
        out = F.softmax(out, dim=1)
        loss_value = self.criterion(out, data.y)
        if phase =="train":
            loss_value.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()
        return out, loss_value.detach().cpu().item()

    @staticmethod
    def cls_accuracy(output, target, n_classes, topk=(1,)):
        _, pred = output.topk(max(topk), 1, True, True)
        idx = (pred[:, 0] < n_classes)
        pred = pred[idx, :]
        target = target[idx]
        pred = pred.t()
        correct = pred.eq(target.view(1, -1).expand_as(pred))
        res = []
        for k in topk:
            correct_k = correct[:k].float().max(0)[0].sum()
            res.append((correct_k / target.size(0)).detach().item())
        return res

    >>> input
    DataBatch(x=[874, 1032], edge_index=[2, 24908], edge_attr=[24908, 2], y=[48], batch=[874], ptr=[49])
    >>> input.x.is_leaf
    True

I'm using PyTorch Geometric.
CUDA runs out of memory after 78 batches out of 119.
Do you see any memory leak here?
Thank you

In my experience, this has always happened to me when accumulating with += the way you do in this update method. I see that val is detached, but what about n?
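
For example, something like this made-up minimal sketch (not your exact code) keeps every iteration's graph alive, because the running sum turns into a tensor that still references all previous graphs:

    import torch

    running_sum = 0.0
    for step in range(100):
        x = torch.randn(8, 8, requires_grad=True)
        loss = (x ** 2).mean()
        # Leaks: loss is still attached to its graph, so running_sum becomes a
        # tensor chaining together the graphs of every previous iteration.
        running_sum += loss
        # Safe alternative: accumulate a plain Python number instead.
        # running_sum += loss.item()

If n (or val) is ever a tensor that still requires grad, the sum inside AverageMeter would hold onto every batch's graph in the same way.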

input.num_graphs is a float
