CUDA Memory Profiling

I’ve tried a new approach using torch.cuda.max_memory_allocated(). For this I created a class, as mentioned in this topic, and I call it in my training loop. The problem is that even if I reorganize the layer repartition between GPUs, the memory allocated on both GPUs keeps growing and the peaks are always equal to those of the first run.

For example, with a model of 14 layers split for the first run as 11 layers on GPU0 and 3 on GPU1, I get peaks of [1371, 527] MB, and even if during the same process I swap some layers so that there are 7 on both, I always get the same peak values. If instead I start the run with 7 layers on both, I get different peak values: [785, 673] MB.

So the first issue I imagine is that the memory is never released and never reallocated. How can I force that? None of the torch.cuda reset methods seems to work…
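To be concrete, here is a minimal sketch of the per-device query and reset calls I mean (the device index is just an example, this is not my real code):

import torch

device = 0  # example device index

torch.cuda.memory_allocated(device)                 # bytes currently allocated
torch.cuda.max_memory_allocated(device)             # peak allocation since the last peak reset
torch.cuda.reset_peak_memory_stats(device)          # reset the peak trackers
torch.cuda.reset_accumulated_memory_stats(device)   # reset the accumulated counters
torch.cuda.empty_cache()                            # release cached blocks held by the allocator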

Here is my profiling class:

import torch

def BToMb(x):
    # convert bytes to megabytes
    return x // (1024 * 1024)

class TraceMalloc():
    def __init__(self, nb_gpu):
        self.nb_gpu = nb_gpu
        self.begin  = [0] * nb_gpu
        self.end    = [0] * nb_gpu
        self.peak   = [0] * nb_gpu
        self.peaked = [0] * nb_gpu

    def __enter__(self):
        for device in range(self.nb_gpu):
            # snapshot of the memory already allocated on each device
            torch.cuda.reset_accumulated_memory_stats(device)
            self.begin[device] = torch.cuda.memory_allocated(device)
        return self

    def __exit__(self, *exc):
        for device in range(self.nb_gpu):
            # read the current allocation and the peak reached inside the block
            self.end[device]    = torch.cuda.memory_allocated(device)
            self.peak[device]   = torch.cuda.max_memory_allocated(device)
            self.peaked[device] = BToMb(self.peak[device] - self.begin[device])
            torch.cuda.reset_accumulated_memory_stats(device)

        for device in range(self.nb_gpu):
            print(f"GPU n°{device}")
            print(f"    Memory begin -> {BToMb(self.begin[device])} MB")
            print(f"    Memory end   -> {BToMb(self.end[device])} MB")
            # print(f"Memory used  -> {self.used[device]} MB")
            print(f"    Memory peak  -> {self.peaked[device]} MB")

Here is my training loop where it is called:

def evaluate_mem(self):
    from .dataset import PipelineDataset

    dataset = PipelineDataset(1024, self.input_shape[1:], [1] if len(self.output_shape) == 1 else self.output_shape[1:])
    dataloader = torch.utils.data.DataLoader(dataset, batch_size=self.input_shape[0], shuffle=True)

    criterion = torch.nn.CrossEntropyLoss()

    model = self.get_modules()
    model = torch.distributed.pipeline.sync.Pipe(model, 2)
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001)

    # self.prof.start()
    # with torch.no_grad():
    with self.trace_gpu_alloc:
        for epoch in range(10):
            running_loss = 0.0
            for i, (inputs, labels) in enumerate(dataloader):
                # optimizer.zero_grad()
                inputs = inputs.to(0)
                labels = labels.to(self.nb_gpu - 1)

                # Forward pass through the pipeline
                outputs = model(inputs).local_value()
                loss = criterion(outputs, labels.squeeze())

                loss.backward()
                # self.prof.step()
                optimizer.step()

    torch.cuda.empty_cache()
    print(f"Outer loop --------> {self.trace_gpu_alloc.peaked}")

    return self.trace_gpu_alloc.peaked
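As a side note, Pipe needs the RPC framework to be initialized before it is constructed; in my setup this is done elsewhere, roughly like this single-process sketch (the address, port and worker name are just placeholders):

import os
import torch.distributed.rpc as rpc

os.environ.setdefault("MASTER_ADDR", "localhost")  # placeholder
os.environ.setdefault("MASTER_PORT", "29500")      # placeholder
rpc.init_rpc("worker", rank=0, world_size=1)       # single process, single worker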

And finally, an example of the model used:

import torch
from torch import nn

class x_layer(nn.Module):
    def forward(self, input):
        ret = input
        return ret

class conv1_layer(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.Conv2d(3, 6, kernel_size=(5, 5), stride=(1, 1))
    def forward(self, input):
        ret = self.fc(input)
        return ret

class relu_layer(nn.Module):
    def forward(self, input):
        ret = torch.nn.functional.relu(input, inplace=False)
        return ret

class pool_layer(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    def forward(self, input):
        ret = self.fc(input)
        return ret

class conv2_layer(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.Conv2d(6, 16, kernel_size=(5, 5), stride=(1, 1))
    def forward(self, input):
        ret = self.fc(input)
        return ret

class relu_1_layer(nn.Module):
    def forward(self, input):
        ret = torch.nn.functional.relu(input, inplace=False)
        return ret

class pool_1_layer(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    def forward(self, input):
        ret = self.fc(input)
        return ret

class flatten_layer(nn.Module):
    def forward(self, input):
        ret = torch.flatten(input, 1)
        return ret

class fc1_layer(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.Linear(in_features=400, out_features=120, bias=True)
    def forward(self, input):
        ret = self.fc(input)
        return ret

class relu_2_layer(nn.Module):
    def forward(self, input):
        input = input.clone()
        ret = torch.nn.functional.relu(input, inplace=False)
        return ret

class fc2_layer(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.Linear(in_features=120, out_features=84, bias=True)
    def forward(self, input):
        ret = self.fc(input)
        return ret

class relu_3_layer(nn.Module):
    def forward(self, input):
        ret = torch.nn.functional.relu(input, inplace=False)
        return ret

class fc3_layer(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.fc = nn.Linear(in_features=84, out_features=10, bias=True)
    def forward(self, input):
        ret = self.fc(input)
        return ret

class output_layer(nn.Module):
    def forward(self, input):
        ret = input
        return ret

class PipelinedModel(nn.Module):
    def __init__(self) -> None:
        super().__init__()
        self.s0 = nn.Sequential(x_layer(), conv1_layer(), relu_layer(), pool_layer(), conv2_layer(), relu_1_layer(), pool_1_layer(), flatten_layer(), fc1_layer(), relu_2_layer()).cuda(0)
        self.s1 = nn.Sequential(fc2_layer(), relu_3_layer(), fc3_layer(), output_layer()).cuda(1)
    def forward(self, input):
        ret = input
        ret = self.s0(ret.to(0))
        ret = self.s1(ret.to(1))
        return ret
    def get_modules(self):
        return nn.Sequential(nn.Sequential(*self.s0), nn.Sequential(*self.s1))
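When I say I swap layers between GPUs, I mean rebuilding the two partitions from the same flat list of layer modules with a different split index, roughly like this (a sketch; build_partitions and layer_list are hypothetical names, not my exact code):

def build_partitions(layers, split_index):
    # hypothetical helper: first split_index layers on GPU 0, the rest on GPU 1
    s0 = nn.Sequential(*layers[:split_index]).cuda(0)
    s1 = nn.Sequential(*layers[split_index:]).cuda(1)
    return nn.Sequential(s0, s1)

# first run: 11 layers on GPU 0 and 3 on GPU 1; second configuration: 7 on each
# model = build_partitions(layer_list, 11)
# model = build_partitions(layer_list, 7)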

Hope I was clear in my explanation :x