Unable to free all GPU memory

Bar_Dubovski · August 11, 2024, 8:16am

Hello all,

I have read many threads about ways to free GPU memory and wrote a simple example to test my code. I believe I’m still missing something, but I can’t seem to find what it is.

To my knowledge, I moved all of the tensors to the CPU and deleted them; I thought that should free the memory.

I’ve created a loop that clears the GPU memory every epoch, then initializes a new model and trains it. I expected that when I stop in debug mode right after the “wipe_memory” call, I would see that the GPU memory is free.

import gc

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

# Define model
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten, then 784 -> 512 -> 512 -> 10 with ReLU in between."""

    def __init__(self):
        super().__init__()
        n_inputs, n_hidden, n_outputs = 28*28, 512, 10
        self.flatten = nn.Flatten()
        # Layers are built in forward order so the Sequential indices (0, 2, 4
        # for the Linear layers) stay the same.
        stack = [
            nn.Linear(n_inputs, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_hidden),
            nn.ReLU(),
            nn.Linear(n_hidden, n_outputs),
        ]
        self.linear_relu_stack = nn.Sequential(*stack)

    def forward(self, x):
        """Flatten `x` and return the output of the linear/ReLU stack (raw logits)."""
        return self.linear_relu_stack(self.flatten(x))



class Fitter:
    """Owns a model/optimizer pair and rebuilds both at the start of every `train` call.

    `loss_fn` is any callable taking ``(predictions, targets)`` and returning a
    scalar tensor that supports ``.backward()``.
    """

    def __init__(self, loss_fn):
        self.model = self.init_model()
        self.optimizer = self.init_optimizer()
        self.loss_fn = loss_fn

    def init_model(self):
        """Return a freshly initialised `NeuralNetwork`."""
        return NeuralNetwork()

    def init_optimizer(self):
        """Return an SGD optimizer (lr=1e-3) over the current model's parameters."""
        return torch.optim.SGD(self.model.parameters(), lr=1e-3)

    def wipe_memory(self):
        """Drop the current model and optimizer and release cached CUDA blocks.

        Safe to call repeatedly: attributes that are already gone are skipped.
        What actually frees tensor memory is dropping the last reference to the
        tensors; moving them to the CPU first is kept from the original code
        but is not strictly required.
        """
        cpu = torch.device('cpu')
        if getattr(self, "optimizer", None) is not None:
            self._optimizer_to(cpu)
        if getattr(self, "model", None) is not None:
            self._model_to(cpu)

        for attr in ("model", "optimizer"):
            if hasattr(self, attr):
                delattr(self, attr)

        # Collect reference cycles first so their tensors are really released,
        # then ask the caching allocator to hand unused blocks back.
        gc.collect()
        torch.cuda.empty_cache()

    @staticmethod
    def _param_to_device(param, device):
        """Move a tensor, or every tensor value of a dict, to `device` in place.

        Non-tensor values (and non-tensor dict entries) are left untouched.
        """
        def _move(tensor):
            tensor.data = tensor.data.to(device)
            if tensor.grad is not None:
                tensor.grad.data = tensor.grad.data.to(device)

        if isinstance(param, torch.Tensor):
            _move(param)
        elif isinstance(param, dict):
            for subparam in param.values():
                if isinstance(subparam, torch.Tensor):
                    _move(subparam)

    def _optimizer_to(self, device):
        """Move every tensor held in the optimizer's per-parameter state to `device`."""
        for param_state in self.optimizer.state.values():
            self._param_to_device(param_state, device)

    def _model_to(self, device):
        """Move the model's parameters, buffers and gradients to `device`.

        `state_dict()` returns detached tensors, so reassigning `.data` on them
        (as the previous implementation did) never touched the module's own
        parameters. `Module.to` moves the real ones.
        """
        self.model.to(device)

    def train(self, dataloader, device):
        """Discard the previous model, build a new one and train it for one pass.

        Args:
            dataloader: iterable of ``(X, y)`` batches exposing ``.dataset``
                (its length is used for the progress print-out).
            device: device (or device string) the model and batches are moved to.
        """
        if torch.cuda.is_available():
            print(torch.cuda.memory_summary())
        self.wipe_memory()

        # After the wipe, `torch.cuda.memory_allocated()` is the number to
        # check. Tools such as nvidia-smi typically still report a few hundred
        # MB for the process: that is the CUDA context (kernels, handles)
        # created on first use of the GPU, which `empty_cache()` cannot release.
        self.model = self.init_model()
        # Move the model before constructing the optimizer, as recommended by
        # the torch.optim documentation.
        self.model.to(device)
        self.optimizer = self.init_optimizer()
        self.model.train()
        size = len(dataloader.dataset)

        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)

            # Compute prediction error
            pred = self.model(X)
            loss = self.loss_fn(pred, y)

            # Backpropagation
            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()

            if batch % 100 == 0:
                # Separate name so the loss tensor is not shadowed by a float.
                loss_value, current = loss.item(), (batch + 1) * len(X)
                print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")


#
#
if __name__ == '__main__':

    # Fetch FashionMNIST (downloading into ./data if needed): the training
    # split first, then the test split.
    training_data, test_data = [
        datasets.FashionMNIST(
            root="data",
            train=use_train_split,
            download=True,
            transform=ToTensor(),
        )
        for use_train_split in (True, False)
    ]

    batch_size = 64

    # One loader per split, both with the same batch size.
    train_dataloader, test_dataloader = [
        DataLoader(split, batch_size=batch_size)
        for split in (training_data, test_data)
    ]

    # Show the shapes of the first test batch only.
    for X, y in test_dataloader:
        print(f"Shape of X [N, C, H, W]: {X.shape}")
        print(f"Shape of y: {y.shape} {y.dtype}")
        break

    loss_fn = nn.CrossEntropyLoss()
    fitter = Fitter(loss_fn)

    device = 'cuda:0'
    epochs = 1000
    # Each call to train() wipes the previous model and trains a new one.
    for t in range(epochs):
        print(f"Epoch {t+1}\n-------------------------------")
        fitter.train(train_dataloader, device)
    print("Done!")

Your help is much appreciated!

Soumya_Kundu · August 12, 2024, 5:14am

Based on your code structure, there should be no memory held there. What’s your memory at that stage?

Are you sure it’s not from something else?

Bar_Dubovski · August 12, 2024, 7:00am

Thank you for your reply.
Yes, I am sure. I just ran it once again, and the second time I get to the line “self.wipe_memory()” I see that there is 406MB allocated; after the invocation there is still 402MB left on the device.

any suggestions?

By the way, if I comment out the whole section “for batch, (X, y) in enumerate(dataloader):”, the memory right after the 1st init is 328MB, and after the wipe it is 306MB.

Soumya_Kundu · August 12, 2024, 7:59am

Would that be coming from the dataloader? I am not too sure, to be honest.