Very weird CUDA out of memory error

I have a really weird issue with my NLP model. My GPU has 4 GB of memory, the model and embeddings consume about 1 GB, and my mini-batches of size 25 are only around 5-10 MB each. During the first iteration, nvidia-smi shows about 1100 MB and torch.cuda.memory_allocated() reports about 680 MB of usage, yet I still get a CUDA out of memory error. What is wrong with the model?
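
For reference, this is roughly how I read those numbers: memory_allocated() only counts tensors that are currently alive, while nvidia-smi also includes the CUDA context plus whatever the caching allocator keeps reserved. A quick sketch, assuming torch.cuda.max_memory_allocated() and torch.cuda.memory_cached() are available in this torch version:

import torch

def report_gpu_memory(tag):
    # Live tensors vs. the peak since start, plus what the caching
    # allocator has reserved from the driver (roughly the nvidia-smi view).
    print('{}: allocated {:.1f} MB, peak {:.1f} MB, cached {:.1f} MB'.format(
        tag,
        torch.cuda.memory_allocated() / 1024**2,
        torch.cuda.max_memory_allocated() / 1024**2,
        torch.cuda.memory_cached() / 1024**2))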

This is my model definition:

import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
    def __init__(self, **params):
        super(Model, self).__init__()
        self.channel_size = params['channels']
        self.conv_region_embedding = nn.Conv2d(1, self.channel_size, (3, params['emb_size']), stride=1)
        self.conv3 = nn.Conv2d(self.channel_size, self.channel_size, (3, 1), stride=1)
        self.pooling = nn.MaxPool2d(kernel_size=(3, 1), stride=2)
        self.padding_conv = nn.ZeroPad2d((0, 0, 1, 1))
        self.padding_pool = nn.ZeroPad2d((0, 0, 0, 1))
        self.act_fun = nn.ReLU()
        self.linear_out = nn.Linear(self.channel_size, 3)
        self.dropout = nn.Dropout2d(params['dropout'])
        self.ad_pool = lambda x: F.adaptive_max_pool2d(x, (1, 1)).cuda()

    def forward(self, x):
        batch = x.shape[0]
        x = x.view(x.size(0), 1, x.size(1), x.size(2))

        x = self.conv_region_embedding(x)
        x = self.dropout(x)
        
        print('Tensor size: {}, Memory allocated: {} MB'.format(x.size(), torch.cuda.memory_allocated() / 1024**2))
        
        px = self.pooling(x)
        x = self.pooling(x)
        
        x = self.padding_conv(x) 
        x = self.act_fun(x)
        x = self.conv3(x)
        x = self.dropout(x)
        
        print('Tensor size: {}, Memory allocated: {} MB'.format(x.size(), torch.cuda.memory_allocated() / 1024**2))
        
        x = self.padding_conv(x)
        x = self.act_fun(x)
        x = self.conv3(x)
        x = self.dropout(x)
        
        print('Tensor size: {}, Memory allocated: {} MB'.format(x.size(), torch.cuda.memory_allocated() / 1024**2))
        
        x = x + px
        
        del px

        while x.size()[-2] > 2:
            x = self._block(x)
            print('Tensor size: {}, Memory allocated: {} MB'.format(x.size(), torch.cuda.memory_allocated() / 1024**2))
        
        x = self.ad_pool(x)
        
        print('Tensor size: {}, Memory allocated: {} MB'.format(x.size(), torch.cuda.memory_allocated() / 1024**2))
        x = x.view(batch, self.channel_size)
        
        x = self.linear_out(x)
        
        return x

    def _block(self, x):
        px = self.pooling(x)
        x = self.pooling(x)

        # Convolution
        x = self.padding_conv(px)
        x = self.act_fun(x)
        x = self.conv3(x)
        x = self.dropout(x)

        x = self.padding_conv(x)
        x = self.act_fun(x)
        x = self.conv3(x)
        x = self.dropout(x)
        x = x + px
        
        del px

        return x

And this is the code I use to execute it:

import torchtext.data as data
import torch.nn as nn
import torch

model = Model(channels=128, emb_size=300, dropout=0).cuda()

TEXT = data.Field(batch_first=True, pad_token=None)
LABEL = data.Field(sequential=False, pad_token=None, use_vocab=False)

fields = [('id', None),
       ('category', None),
       ('content', TEXT),
       ('stars', LABEL),
       ('word_count', None)]

train, dev = data.TabularDataset.splits('',
                                   train='train.csv',
                                   validation='dev.csv',
                                   format="csv",
                                   fields=fields,
                                   skip_header=True)

TEXT.build_vocab(train)
emb = nn.Embedding(len(TEXT.vocab), 300).cuda()
emb.weight.requires_grad_(False)

from torch.optim import Adam
train_dl = data.Iterator(train, batch_size=25, repeat=False, shuffle=False)
optimizer = Adam(model.parameters(), lr=1e-3)

for i, batch in enumerate(iter(train_dl)):
    x = batch.content.cuda()
    
    print('Tensor size: {}, Memory allocated: {} MB'.format(x.size(), torch.cuda.memory_allocated() / 1024**2))
    
    x = emb(x)
    
    print('Tensor size: {}, Memory allocated: {} MB'.format(x.size(), torch.cuda.memory_allocated() / 1024**2))
    
    y = ((batch.stars >= 7).type(torch.long) + (batch.stars >= 5).type(torch.long)).cuda()
    y_pred = model(x)
    
    crit = nn.CrossEntropyLoss()
    
    optimizer.zero_grad()
    loss = crit(y_pred, y)
    
    loss.backward()
    
    optimizer.step()
    
    break

The output is:

Tensor size: torch.Size([25, 658]), Memory allocated: 586.00634765625 MB
Tensor size: torch.Size([25, 658, 300]), Memory allocated: 604.88134765625 MB
Tensor size: torch.Size([25, 128, 656, 1]), Memory allocated: 613.0068359375 MB
Tensor size: torch.Size([25, 128, 327, 1]), Memory allocated: 645.2568359375 MB
Tensor size: torch.Size([25, 128, 327, 1]), Memory allocated: 653.5068359375 MB
Tensor size: torch.Size([25, 128, 163, 1]), Memory allocated: 664.0068359375 MB
Tensor size: torch.Size([25, 128, 81, 1]), Memory allocated: 673.12060546875 MB
Tensor size: torch.Size([25, 128, 40, 1]), Memory allocated: 676.63623046875 MB
Tensor size: torch.Size([25, 128, 19, 1]), Memory allocated: 678.357421875 MB
Tensor size: torch.Size([25, 128, 9, 1]), Memory allocated: 679.22412109375 MB
Tensor size: torch.Size([25, 128, 4, 1]), Memory allocated: 679.66357421875 MB
Tensor size: torch.Size([25, 128, 1, 1]), Memory allocated: 679.8466796875 MB
Tensor size: torch.Size([25, 128, 1, 1]), Memory allocated: 679.88330078125 MB

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-4-071a17ef7257> in <module>()
     23     loss = crit(y_pred, y)
     24 
---> 25     loss.backward()
     26 
     27     optimizer.step()

~/anaconda3/lib/python3.6/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
     91                 products. Defaults to ``False``.
     92         """
---> 93         torch.autograd.backward(self, gradient, retain_graph, create_graph)
     94 
     95     def register_hook(self, hook):

~/anaconda3/lib/python3.6/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
     88     Variable._execution_engine.run_backward(
     89         tensors, grad_tensors, retain_graph, create_graph,
---> 90         allow_unreachable=True)  # allow_unreachable flag
     91 
     92 

RuntimeError: CUDA error: out of memory

Can anyone help with this issue?

It looks like you get the OOM error during the backward() call.
The intermediate activations stored for the backward pass might take too much memory. Could you try lowering the batch size and see if it works then?
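
If lowering the batch size is not an option, gradient checkpointing on the repeated blocks could also help, since the block's activations are then recomputed during backward instead of being kept alive. A rough sketch, assuming your torch version ships torch.utils.checkpoint (with a non-zero dropout you would also have to think about the RNG state):

from torch.utils.checkpoint import checkpoint

# Inside Model.forward: recompute each block during backward instead of
# storing its intermediate activations.
while x.size()[-2] > 2:
    x = checkpoint(self._block, x)

This trades extra compute during backward for a smaller activation footprint.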

Of course I can reduce the batch size, but the problem is that, given the size of the batch and the model, it should easily fit into memory. This is not the only case where something like this has happened to me; sometimes I hit it even with a model of 17M parameters, a batch size of 1, and 4 GB of GPU memory.
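
To put numbers on it, I would measure the real peak of one full training step with a batch size that still fits and compare it against the values printed during forward; a rough sketch, assuming torch.cuda.max_memory_allocated() and max_memory_cached() exist in this version:

y_pred = model(x)
loss = crit(y_pred, y)
loss.backward()
optimizer.step()
torch.cuda.synchronize()
# The backward pass keeps the forward activations alive, so the peak is
# usually well above anything printed during forward.
print('Peak allocated: {:.1f} MB, peak cached: {:.1f} MB'.format(
    torch.cuda.max_memory_allocated() / 1024**2,
    torch.cuda.max_memory_cached() / 1024**2))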

Did you solve the issue? I encountered a similar problem as well.