PyTorch GPU memory allocation issues (GiB reserved in total by PyTorch)

Hello,
I have defined a DenseNet architecture in PyTorch and want to train it on data consisting of 15000 samples of 128x128 images. Here is the code:

import torch
import torch.nn as nn

class Dense_Block(nn.Module):
    def __init__(self, in_channels):
        super(Dense_Block, self).__init__()

        self.relu = nn.ReLU(inplace = True)
        self.bn = nn.BatchNorm2d(num_features = in_channels)

        self.conv1 = nn.Conv2d(in_channels = in_channels, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)
        self.conv2 = nn.Conv2d(in_channels = 32, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)
        self.conv3 = nn.Conv2d(in_channels = 64, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)
        self.conv4 = nn.Conv2d(in_channels = 96, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)
        self.conv5 = nn.Conv2d(in_channels = 128, out_channels = 32, kernel_size = 3, stride = 1, padding = 1)

    def forward(self, x):

        bn = self.bn(x)
        conv1 = self.relu(self.conv1(bn))

        conv2 = self.relu(self.conv2(conv1))
        c2_dense = self.relu(torch.cat([conv1, conv2], 1))

        conv3 = self.relu(self.conv3(c2_dense))
        c3_dense = self.relu(torch.cat([conv1, conv2, conv3], 1))

        conv4 = self.relu(self.conv4(c3_dense))
        c4_dense = self.relu(torch.cat([conv1, conv2, conv3, conv4], 1))

        conv5 = self.relu(self.conv5(c4_dense))
        c5_dense = self.relu(torch.cat([conv1, conv2, conv3, conv4, conv5], 1))

        return c5_dense

class Transition_Layer(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(Transition_Layer, self).__init__()

        self.relu = nn.ReLU(inplace = True)
        self.bn = nn.BatchNorm2d(num_features = out_channels)
        self.conv = nn.Conv2d(in_channels = in_channels, out_channels = out_channels, kernel_size = 1, bias = False)
        self.avg_pool = nn.AvgPool2d(kernel_size = 2, stride = 2, padding = 0)

    def forward(self, x):

        bn = self.bn(self.relu(self.conv(x)))
        out = self.avg_pool(bn)

        return out

class DenseNet(nn.Module):
    def __init__(self, nr_classes):
        super(DenseNet, self).__init__()

        self.lowconv = nn.Conv2d(in_channels = 1, out_channels = 64, kernel_size = 7, padding = 3, bias = False)
        self.relu = nn.ReLU()

        # Make Dense Blocks
        self.denseblock1 = self._make_dense_block(Dense_Block, 64)
        self.denseblock2 = self._make_dense_block(Dense_Block, 128)
        self.denseblock3 = self._make_dense_block(Dense_Block, 128)

        # Make transition Layers
        self.transitionLayer1 = self._make_transition_layer(Transition_Layer, in_channels = 160, out_channels = 128)
        self.transitionLayer2 = self._make_transition_layer(Transition_Layer, in_channels = 160, out_channels = 128)
        self.transitionLayer3 = self._make_transition_layer(Transition_Layer, in_channels = 160, out_channels = 64)

        # Classifier
        self.bn = nn.BatchNorm2d(num_features = 64)
        self.pre_classifier = nn.Linear(64*16*16, 512)
        self.classifier = nn.Linear(512, nr_classes)

    def _make_dense_block(self, block, in_channels):
        layers = []
        layers.append(block(in_channels))
        return nn.Sequential(*layers)

    def _make_transition_layer(self, layer, in_channels, out_channels):
        modules = []
        modules.append(layer(in_channels, out_channels))
        return nn.Sequential(*modules)

    def forward(self, x):
        out = self.relu(self.lowconv(x))

        out = self.denseblock1(out)
        out = self.transitionLayer1(out)

        out = self.denseblock2(out)
        out = self.transitionLayer2(out)

        out = self.denseblock3(out)
        out = self.transitionLayer3(out)

        out = self.bn(out)
#         print(out.shape)
        out = out.reshape(-1, 64*16*16)

        out = self.pre_classifier(out)
        out = self.classifier(out)

        return out
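
As a sanity check on the 64*16*16 flatten (just a minimal sketch I run on the CPU, assuming a single-channel 128x128 input like mine):

# Sketch: each transition layer halves the spatial size, 128 -> 64 -> 32 -> 16,
# so the classifier input is 64 channels * 16 * 16.
model = DenseNet(nr_classes=128).eval()
with torch.no_grad():
    out = model(torch.randn(1, 1, 128, 128))
print(out.shape)   # expected: torch.Size([1, 128])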

Then I define my Dataset class:

import numpy as np
from torch.utils.data import Dataset

class MyDataset(Dataset):
    def __init__(self, images, n, labels=None, transforms=None):
        self.X = images
        self.y = labels
        self.n = n
        self.transforms = transforms
         
    def __len__(self):
        return (len(self.X))
    
    def __getitem__(self, i):
        data = self.X.iloc[i, :]
        data = np.asarray(data).astype(float).reshape(1, self.n, self.n)  # reshape the flattened row into a (1, n, n) image
        
        if self.transforms:
            data = self.transforms(data).reshape(1, self.n, self.n)
            
        if self.y is not None:
            y = self.y.iloc[i,:]
            y = np.asarray(y).astype(float).reshape(128,) # the 128-vector of labels for this sample
            return (data, y)
        else:
            return data

Then I create the instances of the train, dev, and test data:

train_data = MyDataset(train_images, n, train_labels, None)
dev_data = MyDataset(dev_images, n, dev_labels, None)
test_data = MyDataset(test_images, n, test_labels, None)
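
Just to confirm the dataset returns what I expect, a quick check of one sample (a sketch; here n = 128 and train_images / train_labels are pandas DataFrames):

x0, y0 = train_data[0]
print(x0.shape, y0.shape)   # expected: (1, 128, 128) and (128,)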

The shapes of train_images, dev_images and test_images are (15000, 16384), (4000, 16384) and (1000, 16384) respectively, so in total there are 20000 samples of 128x128 (= 16384 pixels) images.

The shapes of train_labels, dev_labels and test_labels are (15000, 128), (4000, 128) and (1000, 128) respectively, so in total there are 20000 label vectors of length 128.

I also define a custom loss function:

# Huber loss, averaged over all elements
def Huber(yHat, y, delta=1.):
    size = yHat.size(0) * yHat.size(1)
    diff = yHat - y
    return torch.sum(torch.where(torch.abs(diff) < delta,
                                 .5 * diff**2,
                                 delta * (torch.abs(diff) - .5 * delta**2))) / size
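
(Side note: with the default delta = 1 this should match the built-in torch.nn.HuberLoss with mean reduction, if your PyTorch version already has it. A small sketch to compare the two:)

# Sketch: compare the custom Huber (delta = 1) with nn.HuberLoss from newer PyTorch versions.
yHat = torch.randn(4, 128)
y = torch.randn(4, 128)
print(Huber(yHat, y))
print(nn.HuberLoss(reduction='mean', delta=1.0)(yHat, y))   # should print the same value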

Then I create an instance of the model:

densenet = DenseNet(nr_classes=128).float().to('cuda:0')
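
For reference, a quick sketch to see how much memory the weights themselves take (the activations saved for the backward pass usually dominate during training, though):

# Sketch: parameter count and weight memory, assuming float32 (4 bytes per parameter).
n_params = sum(p.numel() for p in densenet.parameters())
print(f"{n_params} parameters, ~{n_params * 4 / 1024**2:.1f} MiB of weights")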

Then I initialize the parameters, create the train and dev DataLoaders, and train the model with the Adam optimizer and the Huber loss function:

import multiprocessing

import torch.optim as optim
from torch.utils.data import DataLoader

def main():
    # initialize the first conv layer's weights
    nn.init.kaiming_uniform_(list(densenet.parameters())[0], nonlinearity='relu')
    loader = DataLoader(train_data, batch_size=200, shuffle=False, num_workers=0)
    loader_dev = DataLoader(dev_data, batch_size=10, shuffle=False, num_workers=0)
    optimizer = optim.Adam(densenet.parameters(), lr=.001, betas=(0.9, 0.999), eps=1e-08)
    N_epochs = 10
    for epoch in range(N_epochs):
        for batch in loader:
            images = batch[0].float().to('cuda:0')
            labels = batch[1].float().to('cuda:0')
            preds = densenet(images)
            loss = Huber(preds, labels)

            # evaluate on the dev set (no gradients needed here)
            with torch.no_grad():
                loss_dev = 0
                for batch_dev in loader_dev:
                    images_dev = batch_dev[0].float().to('cuda:0')
                    labels_dev = batch_dev[1].float().to('cuda:0')
                    preds_dev = densenet(images_dev)
                    loss_dev += Huber(preds_dev, labels_dev)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

if __name__ == '__main__':
    multiprocessing.freeze_support()
    main()

I have two GPUs, identified as cuda:0 (~24 GiB of memory) and cuda:1 (~6 GiB of memory).
With the cuda:1 device I get this error message:


  File "D:\Jupiter_playground\fashion_mnist_tidied.py", line 1127, in <module>
    main()

  File "D:\Jupiter_playground\fashion_mnist_tidied.py", line 1074, in main
    preds = network(images) # Pass Batch

  File "C:\Users\Admin\.conda\envs\pytorch_env\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)

  File "D:\Jupiter_playground\fashion_mnist_tidied.py", line 752, in forward
    out = self.denseblock1(out)

  File "C:\Users\Admin\.conda\envs\pytorch_env\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)

  File "C:\Users\Admin\.conda\envs\pytorch_env\lib\site-packages\torch\nn\modules\container.py", line 117, in forward
    input = module(input)

  File "C:\Users\Admin\.conda\envs\pytorch_env\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)

  File "D:\Jupiter_playground\fashion_mnist_tidied.py", line 691, in forward
    c3_dense = self.relu(torch.cat([conv1, conv2, conv3], 1))

RuntimeError: CUDA out of memory. Tried to allocate 1.17 GiB (GPU 1; 6.00 GiB total capacity; 4.34 GiB already allocated; 16.62 MiB free; 4.34 GiB reserved in total by PyTorch)

Then I switched to the cuda:0 device, which has much more memory, and in this case the error reads:


  File "D:\Jupiter_playground\fashion_mnist_tidied.py", line 1127, in <module>
    main()

  File "D:\Jupiter_playground\fashion_mnist_tidied.py", line 1074, in main
    preds = network(images) # Pass Batch

  File "C:\Users\Admin\.conda\envs\pytorch_env\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)

  File "D:\Jupiter_playground\fashion_mnist_tidied.py", line 752, in forward
    out = self.denseblock1(out)

  File "C:\Users\Admin\.conda\envs\pytorch_env\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)

  File "C:\Users\Admin\.conda\envs\pytorch_env\lib\site-packages\torch\nn\modules\container.py", line 117, in forward
    input = module(input)

  File "C:\Users\Admin\.conda\envs\pytorch_env\lib\site-packages\torch\nn\modules\module.py", line 722, in _call_impl
    result = self.forward(*input, **kwargs)

  File "D:\Jupiter_playground\fashion_mnist_tidied.py", line 691, in forward
    c3_dense = self.relu(torch.cat([conv1, conv2, conv3], 1))

RuntimeError: CUDA out of memory. Tried to allocate 1.17 GiB (GPU 0; 24.00 GiB total capacity; 21.59 GiB already allocated; 372.94 MiB free; 21.69 GiB reserved in total by PyTorch)

Why does PyTorch allocate almost all available memory?

However, when I use a train set of 6 images and a dev set of 3 images (and a test set of 1 image), training works fine on both CUDA devices.
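
For reference, the allocated/reserved numbers from the error message can also be queried from inside the script (a sketch using PyTorch's allocator statistics):

# Sketch: inspect the CUDA caching allocator for the device in use.
print(torch.cuda.memory_allocated('cuda:0') / 1024**3, "GiB allocated by tensors")
print(torch.cuda.memory_reserved('cuda:0') / 1024**3, "GiB reserved by the caching allocator")
print(torch.cuda.memory_summary('cuda:0', abbreviated=True))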

Because your batch size is 200 and DenseNet is demanding. Why don't you use a smaller batch size and increase it up to what the GPU memory allows? I mean, according to the code your batch size is 200.

DenseNet is heavy (densely connected).

Thanks for your reply. What do you mean by increasing the GPU memory? Buying a new one?

Increasing the memory demand. I mean, start from a small batch size and increase it while checking GPU usage with nvidia-smi. You need to run the backward pass as well, since it also requires memory. Once you can run forward and backward right at the memory limit, that's the maximum batch size you can use.
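
Something like this sketch could help with that search (a hypothetical helper, assuming the densenet model and Huber loss from your post; it tries one forward + backward pass per candidate batch size and stops at the first out-of-memory error):

# Hypothetical helper: find the largest batch size that survives one forward + backward pass.
def probe_batch_size(model, device='cuda:0', candidates=(8, 16, 32, 64, 128, 200)):
    largest_ok = None
    for bs in candidates:
        try:
            images = torch.randn(bs, 1, 128, 128, device=device)
            labels = torch.randn(bs, 128, device=device)
            loss = Huber(model(images), labels)
            loss.backward()              # the backward pass also needs memory
            largest_ok = bs
        except RuntimeError as e:        # typically "CUDA out of memory"
            print(f"batch size {bs} failed: {e}")
            break
        finally:
            model.zero_grad()
            torch.cuda.empty_cache()
    return largest_ok

print(probe_batch_size(densenet))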


Thanks for the tip! I guess this is a good systematic approach 🙂