Unable to free all GPU memory

Hello all,

I have read many threads about ways to free GPU memory, and I wrote a simple example to test my code. I believe I’m still missing something, but I can’t seem to find what it is.

To my knowledge, I moved all of the tensors to the CPU and deleted them; I thought that should free the memory.

I’ve created a loop that, every epoch, clears the GPU memory, then initiates a new model and trains it. I thought that when I stop in debug mode right after the “wipe_memory” call, I’ll see that the GPU memory is free.

import gc

import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

# Define model
class NeuralNetwork(nn.Module):
    """Fully connected classifier: flatten, then 784 -> 512 -> 512 -> 10.

    The two hidden ``Linear`` layers are each followed by a ``ReLU``; the
    final ``Linear`` layer's output is returned as-is (no activation).
    """

    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()

        in_features, hidden, n_outputs = 28 * 28, 512, 10
        # Layers are created in forward order so that parameter
        # initialisation order and state_dict indices (0, 2, 4) are stable.
        layers = [
            nn.Linear(in_features, hidden),
            nn.ReLU(),
            nn.Linear(hidden, hidden),
            nn.ReLU(),
            nn.Linear(hidden, n_outputs),
        ]
        self.linear_relu_stack = nn.Sequential(*layers)

    def forward(self, x):
        """Flatten ``x`` and return the output of the linear/ReLU stack."""
        return self.linear_relu_stack(self.flatten(x))



class Fitter:
    """Owns a model/optimizer pair and rebuilds both on every ``train`` call.

    ``train`` first releases the previous model and optimizer via
    ``wipe_memory`` and then fits a freshly initialised model for one pass
    over the dataloader.
    """

    def __init__(self, loss_fn):
        """Create the initial model and optimizer and store ``loss_fn``."""
        self.model = self.init_model()
        self.optimizer = self.init_optimizer()
        self.loss_fn = loss_fn

    def init_model(self):
        """Return a newly constructed ``NeuralNetwork``."""
        return NeuralNetwork()

    def init_optimizer(self):
        """Return an SGD optimizer (lr=1e-3) over ``self.model``'s parameters."""
        return torch.optim.SGD(self.model.parameters(), lr=1e-3)

    def wipe_memory(self):
        """Drop the current model and optimizer and release cached GPU blocks.

        Safe to call repeatedly: attributes that are already gone are skipped.

        NOTE(review): ``torch.cuda.empty_cache()`` only returns the caching
        allocator's unused blocks. Memory that is still reported on the
        device afterwards is presumably the CUDA context itself, which this
        method cannot release -- confirm with
        ``torch.cuda.memory_allocated()`` rather than an external monitor.
        """
        cpu = torch.device('cpu')
        if hasattr(self, 'optimizer'):
            self._optimizer_to(cpu)
        if hasattr(self, 'model'):
            self._model_to(cpu)
        for attr in ('model', 'optimizer'):
            if hasattr(self, attr):
                delattr(self, attr)
        gc.collect()
        torch.cuda.empty_cache()

    @staticmethod
    def _tensor_to_device(tensor, device):
        """Re-home ``tensor``'s data (and its gradient, if any) on ``device``."""
        tensor.data = tensor.data.to(device)
        if tensor.grad is not None:
            tensor.grad.data = tensor.grad.data.to(device)

    @staticmethod
    def _param_to_device(param, device):
        """Move a tensor, or every tensor value of a dict, to ``device``.

        Non-tensor dict values and any other input type are left untouched.
        """
        if isinstance(param, torch.Tensor):
            Fitter._tensor_to_device(param, device)
        elif isinstance(param, dict):
            for value in param.values():
                if isinstance(value, torch.Tensor):
                    Fitter._tensor_to_device(value, device)

    def _optimizer_to(self, device):
        """Move every tensor held in the optimizer's per-parameter state."""
        for param_state in self.optimizer.state.values():
            self._param_to_device(param_state, device)

    def _model_to(self, device):
        """Move the model's parameters, buffers and gradients to ``device``.

        ``state_dict()`` hands back detached tensors, so assigning ``.data``
        on them would leave the real parameters where they were; delegating
        to ``Module.to`` moves the actual module tensors.
        """
        self.model.to(device)

    def train(self, dataloader, device):
        """Discard the previous model, then fit a new one for one pass.

        Args:
            dataloader: iterable of ``(X, y)`` batches exposing ``.dataset``.
            device: device the new model and each batch are moved to.
        """
        # memory_summary() initialises CUDA and fails on machines without
        # it, so only report when a CUDA device is actually usable.
        if torch.cuda.is_available():
            print(torch.cuda.memory_summary())
        self.wipe_memory()

        self.model = self.init_model()
        # Move the model first so the optimizer is built over parameters
        # that already live on the target device.
        self.model.to(device)
        self.optimizer = self.init_optimizer()
        self.model.train()
        size = len(dataloader.dataset)

        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)

            # Compute prediction error
            pred = self.model(X)
            loss = self.loss_fn(pred, y)

            # Backpropagation
            loss.backward()
            self.optimizer.step()
            self.optimizer.zero_grad()

            if batch % 100 == 0:
                # Keep the scalar under its own name instead of rebinding
                # the ``loss`` tensor.
                loss_value, current = loss.item(), (batch + 1) * len(X)
                print(f"loss: {loss_value:>7f}  [{current:>5d}/{size:>5d}]")


#
#
if __name__ == '__main__':

    # FashionMNIST train and test splits, downloaded under root "data".
    training_data = datasets.FashionMNIST(root="data", train=True, download=True, transform=ToTensor())
    test_data = datasets.FashionMNIST(root="data", train=False, download=True, transform=ToTensor())

    # Wrap both splits in loaders that share one batch size.
    batch_size = 64
    train_dataloader = DataLoader(training_data, batch_size=batch_size)
    test_dataloader = DataLoader(test_data, batch_size=batch_size)

    # Report the shape/dtype of the first test batch only, then stop.
    for X, y in test_dataloader:
        print(f"Shape of X [N, C, H, W]: {X.shape}")
        print(f"Shape of y: {y.shape} {y.dtype}")
        break

    loss_fn = nn.CrossEntropyLoss()
    fitter = Fitter(loss_fn)

    device = 'cuda:0'
    epochs = 1000
    # Every call to train() wipes the previous model and fits a new one.
    for epoch_idx in range(epochs):
        print(f"Epoch {epoch_idx+1}\n-------------------------------")
        fitter.train(dataloader=train_dataloader, device=device)
    print("Done!")

Your help is much appreciated!

Based on your code structure, there should be no memory held there. What’s your memory at that stage?

Are you sure it’s not from something else?

Thank you for your reply.
Yes, I am sure. I just ran it once again, and the second time I get to the line “self.wipe_memory()” I see that there is 406MB allocated; after the invocation there is still 402MB left on the device.

any suggestions?

By the way, if I comment out the whole section “for batch, (X, y) in enumerate(dataloader):”, the memory right after the 1st init is 328MB, and after the wipe it is 306MB.

Would that be coming from the dataloader? I am not too sure, to be honest.