CUDA Out of Memory

Barely a few steps into the forward propagation of my LSTM, I received this error:

THCudaCheck FAIL file=/home/soumith/local/builder/wheel/pytorch-src/torch/lib/THC/generic/THCStorage.cu line=66 error=2 : out of memory
Traceback (most recent call last):
  File "external_script.py", line 1002, in <module>
    final_loss = run()
  File "external_script.py", line 584, in run
    optimizer_iter_num, feature, x_dim)
  File "/home/ubuntu/lstm_special/rnn.py", line 74, in lstm_forward
    output, hn = self.net.rnn(input, (self.h, self.c))
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/module.py", line 210, in __call__
    result = self.forward(*input, **kwargs)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/modules/rnn.py", line 79, in forward
    return func(input, self.all_weights, hx)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/_functions/rnn.py", line 228, in forward
    return func(input, *fargs, **fkwargs)
  File "/usr/local/lib/python2.7/dist-packages/torch/autograd/function.py", line 202, in _do_forward
    flat_output = super(NestedIOFunction, self)._do_forward(*flat_input)
  File "/usr/local/lib/python2.7/dist-packages/torch/autograd/function.py", line 218, in forward
    result = self.forward_extended(*nested_tensors)
  File "/usr/local/lib/python2.7/dist-packages/torch/nn/_functions/rnn.py", line 180, in forward_extended
    cudnn.rnn.forward(self, input, hx, weight, output, hy)
  File "/usr/local/lib/python2.7/dist-packages/torch/backends/cudnn/rnn.py", line 257, in forward
    fn.workspace = torch.cuda.ByteTensor(workspace_size.value)
RuntimeError: cuda runtime error (2) : out of memory at /home/soumith/local/builder/wheel/pytorch-src/torch/lib/THC/generic/THCStorage.cu:66

I have a 12 GB NVIDIA Tesla K80 GPU, so this shouldn't be an issue; I believe it has something to do with the Variables.

Checking that CUDA is working:

>>> torch.cuda.is_available()
True
>>> torch.cuda.current_stream()
<torch.cuda.Stream device=0 cuda_stream=0x0>
>>> torch.cuda.device_count()
1L

nvidia-smi output:

+-----------------------------------------------------------------------------+
| NVIDIA-SMI 367.57                 Driver Version: 367.57                    |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|===============================+======================+======================|
|   0  Tesla K80           Off  | 0000:00:1E.0     Off |                    0 |
| N/A   51C    P8    27W / 149W |      2MiB / 11439MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                       GPU Memory |
|  GPU       PID  Type  Process name                               Usage      |
|=============================================================================|
|  No running processes found                                                 |
+-----------------------------------------------------------------------------+

Script

from torch.autograd import Variable
import torch
import torch.nn as nn
import torch.optim as optim
import matplotlib as mpl
import numpy as np

torch.manual_seed(0)


class Net(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, bias, dropout):
        super(Net, self).__init__()
        self.rnn = nn.LSTM(input_size=input_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           bias=bias,
                           dropout=dropout)


class lstmModule:
    def __init__(self, input_size, hidden_size, num_layers, bias, dropout,
                 seq_len, batch_size, meta_lr, n_meta_iter):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bias = bias
        self.dropout = dropout
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.meta_lr = meta_lr
        self.n_meta_iter = n_meta_iter

        self.net = Net(input_size=input_size,
                       hidden_size=hidden_size,
                       num_layers=num_layers,
                       bias=bias,
                       dropout=dropout)

        self.net.cuda()

        self.h0 = Variable(torch.randn(self.num_layers,
                                       self.batch_size,
                                       self.hidden_size)).cuda()

        self.c0 = Variable(torch.randn(self.num_layers,
                                       self.batch_size,
                                       self.hidden_size)).cuda()

        self.optimizer = optim.Adam(self.net.rnn.parameters(), lr=self.meta_lr)

        self.loss_lst = []
        self.loss_lst2 = []

    def lstm_forward(self, seq_num, inp, x_dim):
        inp = inp.reshape(1, 1, inp.shape[0]*inp.shape[1])
        inp = torch.from_numpy(inp).float()
        input = Variable(inp).cuda()

        if seq_num == 0:
            # Ensure clear gradient buffer
            self.optimizer.zero_grad()
            self.loss_tot = [0 for i in range(self.hidden_size)]

            # LSTM
            output, hn = self.net.rnn(input, (self.h0, self.c0))
            output = torch.abs(2 * output)
            op = [output[:, :, i] for i in range(self.hidden_size)]
            self.output_all = op
            self.h, self.c = hn
            return output.cpu().data.numpy()
        else:
            output, hn = self.net.rnn(input, (self.h, self.c))
            output = torch.abs(2 * output)
            op = [output[:, :, i] for i in range(self.hidden_size)]
            self.h, self.c = hn
            self.output_all = [torch.cat((self.output_all[i], op[i]), 0) for i in range(self.hidden_size)]
            return output.cpu().data.numpy()

    def lstm_update(self, lab):
        def pseudo_loss(output, label):
            # print('output size', output.size())
            # print('label size', label.size())
            # return torch.mean(torch.abs(output*label))
            return torch.mean(output * label)

        lab = torch.from_numpy(lab).float()
        self.label = Variable(lab).cuda()

        # Get loss
        self.loss_tot = [
            self.loss_tot[i] + pseudo_loss(self.output_all[i], self.label[:, i]) for i in range(self.hidden_size)]

        # Append loss
        self.loss_lst.append(self.loss_tot[0].cpu().data.numpy()[0])
        self.loss_lst2.append(self.loss_tot[1].cpu().data.numpy()[0])

        # Backprop
        sum(self.loss_tot).backward()

        # Update optimizer
        self.optimizer.step()

        return self.loss_lst, self.loss_lst2

I have an external script that calls lstm_forward, feeding it a tensor of size (1, 1, 7520).
This error shouldn't occur (it didn't with the CPU version of PyTorch), and the forward pass there is really fast.

It's probably because you're caching a lot of intermediate results (e.g. output_all, h, c, loss_tot, etc.), and they prevent the graph from being freed. Also, if you only want to run in inference mode, use volatile=True.

If you know you’re not going to use the intermediate Variables for backpropagation, it’s safer to keep references to tensors only.

What do you mean by keeping references to tensors only?

When you do this:

self.output_all = op

op is a list of Variables, i.e. wrappers around tensors that also keep the autograd history. That history is something you're never going to use, and it will only end up consuming memory. If you do this instead:

self.output_all = [o.data for o in op]

you'll only save the tensors, i.e. the final values.

Also, as I said, use volatile=True if you're only going to run in inference mode. This will save a lot of memory.
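
For instance, a minimal sketch of both suggestions together (using the pre-0.4 Variable API from this thread; the LSTM sizes here are made up for illustration):

# Sketch: volatile inputs for inference, and caching .data instead of whole Variables.
import torch
import torch.nn as nn
from torch.autograd import Variable

rnn = nn.LSTM(input_size=8, hidden_size=4, num_layers=1)

x = Variable(torch.randn(5, 1, 8), volatile=True)   # inference only: no graph is recorded
h0 = Variable(torch.zeros(1, 1, 4), volatile=True)
c0 = Variable(torch.zeros(1, 1, 4), volatile=True)

output, hn = rnn(x, (h0, c0))

cached = output.data  # keep just the tensor values, not the autograd history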

I'm starting to get the hang of how Variables work in PyTorch. Thanks. I'll try freeing some variables and let you know.

I tried your recommendation and it made the process super fast, but it prevented me from backpropagating, because the resulting loss is a plain float rather than a Variable (no history).

I'm struggling to understand why it's running out of memory with 12 GB. @apaszke, I'm thinking there's a bug in PyTorch. When I run htop, it's only taking up a bit over 2 GB. Something must be triggering the errors.

I was running the CPU version with a larger dataset and this came out:

 File "/home/ubuntu/anaconda2/lib/python2.7/site-packages/torch/optim/adam.py", line 52, in step
    state['exp_avg_sq'] = grad.new().resize_as_(grad).zero_()
RuntimeError: $ Torch: not enough memory: you tried to allocate 7GB. Buy new RAM! at /data/users/soumith/miniconda2/conda-bld/pytorch-cuda80-0.1.8_1486039719409/work/torch/lib/TH/THGeneral.c:270

This is weird considering I have more than 60 GB of RAM. Is PyTorch limiting the amount of RAM somehow?

No, it means that the allocation failed: you didn't have enough free RAM at that moment. Since you're running low even on CPU memory, it doesn't seem surprising to me that it also fails on the GPU. If you think there's a bug, it would be very helpful if you could isolate the minimal portion of the code that would allow us to reproduce the error.

This is a self-contained script that you can run with python test_rnn.py.

It works with a small number of hidden states on line 178, like 100 or even 1000. But once it reaches 10000 and above, which is what I need, it becomes problematic.

'''
GPU LSTM
    1 single input
    1 single output
'''

from torch.autograd import Variable
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import matplotlib as mpl
mpl.use('Agg')
import matplotlib.pyplot as plt


class Net(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, bias, dropout):
        super(Net, self).__init__()
        self.rnn = nn.LSTM(input_size=input_size,
                           hidden_size=hidden_size,
                           num_layers=num_layers,
                           bias=bias,
                           dropout=dropout)


def input_var(i):
    test = np.array([i])
#     print(test.shape)
    # test = np.array([i])
    input_var = test.reshape(1, 1, 1)  # (seq_len, batch, input_size)
    input_var = torch.from_numpy(input_var).float()
    return input_var


def label_var(i):
    test = np.array([i*4])
    label_var = test.reshape(1, 1)  #
    label_var = torch.from_numpy(label_var).float()
    return label_var


class lstmModule:
    def __init__(self, input_size, hidden_size, num_layers, bias, dropout,
                 seq_len, batch_size, meta_lr, n_meta_iter):
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.bias = bias
        self.dropout = dropout
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.meta_lr = meta_lr
        self.n_meta_iter = n_meta_iter

        self.net = Net(input_size=input_size,
                       hidden_size=hidden_size,
                       num_layers=num_layers,
                       bias=bias,
                       dropout=dropout)

        self.net.cuda()

        self.h0 = Variable(torch.randn(self.num_layers,
                                       self.batch_size,
                                       self.hidden_size)).cuda()

        self.c0 = Variable(torch.randn(self.num_layers,
                                       self.batch_size,
                                       self.hidden_size)).cuda()

        self.optimizer = optim.Adam(self.net.rnn.parameters(), lr=self.meta_lr)

        self.loss_lst = []

    def lstm_forward(self, seq_num, meta_num):
        def pseudo_loss(output, label):
            return torch.mean(torch.sum(torch.abs(output - label)))

        inp = input_var(seq_num)
        input = Variable(inp).cuda()

        lab = label_var(seq_num)
        label = Variable(lab).cuda()

        if seq_num == 0:

            # Ensure clear gradient buffer
            self.optimizer.zero_grad()
            self.loss_tot = [0 for i in range(self.hidden_size)]

            # Label concatenation
            self.label_all = label

            # LSTM
            output, hn = self.net.rnn(input, (self.h0, self.c0))
            output = 100 * output

            op = [output[:, :, i] for i in range(self.hidden_size)]

            self.output_all = op
            #             print('1 step length:', len(self.output_all))
            self.h, self.c = hn
        else:
            self.label_all = torch.cat((self.label_all, label), 0)
            output, hn = self.net.rnn(input, (self.h, self.c))
            output = 100 * output
            op = [output[:, :, i] for i in range(self.hidden_size)]
            self.h, self.c = hn
            self.output_all = [torch.cat((self.output_all[i], op[i]), 0) for i in range(self.hidden_size)]

        if seq_num == (self.seq_len - 1):
            # Get loss
            self.loss_tot = [self.loss_tot[i] + pseudo_loss(self.output_all[i], self.label_all) for i in range(self.hidden_size)]

            # Append loss
            self.loss_lst.append(sum(self.loss_tot).cpu().data.numpy()[0])

            # Backprop
            sum(self.loss_tot).backward()

            # Update optimizer
            self.optimizer.step()

        if seq_num == (self.seq_len - 1) and meta_num == (self.n_meta_iter - 1):
            # print(len(self.loss_lst))
            print('Loss 1', self.loss_tot[0].cpu().data.numpy())
            print('Loss 2', self.loss_tot[1].cpu().data.numpy())
            plt.clf()
            plt.plot()
            plt.title('Loss Curve')
            plt.plot(self.loss_lst, label='Loss Curve')
            plt.legend(loc='best')
            plt.savefig('loss.png')

    def lstm_check(self, seq_num):
        inp = input_var(seq_num)
        input = Variable(inp).cuda()
        lab = label_var(seq_num)
        label = Variable(lab).cuda()

        if seq_num == 0:
            # Ensure clear gradient buffer
            self.optimizer.zero_grad()
            self.loss_tot = [0 for i in range(self.hidden_size)]

            # Label concatenation
            self.label_all = label

            # LSTM
            output, hn = self.net.rnn(input, (self.h0, self.c0))
            output = 100 * output
            op = [output[:, :, i] for i in range(self.hidden_size)]
            self.output_all = op
            self.h, self.c = hn
        else:
            self.label_all = torch.cat((self.label_all, label), 0)
            output, hn = self.net.rnn(input, (self.h, self.c))
            output = 100 * output
            op = [output[:, :, i] for i in range(self.hidden_size)]
            self.h, self.c = hn
            self.output_all = [torch.cat((self.output_all[i], op[i]), 0) for i in range(self.hidden_size)]

        if seq_num == (self.seq_len - 1):
            print('-' * 10)
            print(self.output_all[0].cpu().data.numpy())
            print(self.label_all.cpu().data.numpy())
            print('-' * 10)
            print(self.output_all[1].cpu().data.numpy())
            print(self.label_all.cpu().data.numpy())

N_meta = 10
LR_meta = 0.1
N_seq = 4
batch_size = 1
layers = 4
input_size = 1
hidden_size = 15000

# Initialize and assign class to object once
# input_size, hidden_size, num_layers, bias, dropout, seq_len, batch_size, meta_lr, n_meta_iter):
print 'Initializing LSTM'
lstm = lstmModule(input_size, hidden_size, layers, True, 0.1, N_seq, batch_size, LR_meta, N_meta)
print 'Initialized LSTM'

# Run through meta iterations
print 'Training'
for j in range(N_meta):
    print('Meta iteration', j)
    # Run through each step
    for i in range(N_seq):
        lstm.lstm_forward(i, j)
print 'Done Training'

# Check
print('-' * 10)
print 'Checking'
for i in range(N_seq):
    lstm.lstm_check(i)
print 'Done Checking'

And why do you think it should work with a history size of 10000? That requires keeping a lot of state around. You can't expect the framework to work with arbitrarily large inputs, and that size sounds like something that could plausibly raise an OOM error.
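
For a rough sense of scale, here is a back-of-envelope parameter count for the settings in the script above (hidden_size=15000, num_layers=4, input_size=1), using the standard LSTM parameter formula of 4*H*(input + H) weights plus 8*H biases per layer:

# Sketch: estimate of the LSTM parameter count and its float32 weight memory.
hidden, layers, inp = 15000, 4, 1

params = 4 * hidden * (inp + hidden) + 8 * hidden                        # first layer
params += (layers - 1) * (4 * hidden * (hidden + hidden) + 8 * hidden)   # remaining layers

print(params)              # ~6.3 billion parameters
print(params * 4.0 / 1e9)  # ~25 GB of float32 weights alone, before gradients and Adam state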

Yeah. Another paper implemented it in a way that requires as many hidden states as there are parameters: https://arxiv.org/abs/1606.04474

I was wondering if you have any suggestions for saving memory?

The only thing that comes to mind is smart usage of requires_grad and volatile. I'd recommend reading these notes.
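
For instance, a minimal sketch of what the two flags do (again with the pre-0.4 Variable API used throughout this thread; the tensor sizes are arbitrary):

# Sketch: requires_grad=False skips gradient computation for a Variable,
# volatile=True disables graph construction for everything downstream.
import torch
from torch.autograd import Variable

w = Variable(torch.randn(10, 10), requires_grad=False)  # no gradient will be computed for w
x = Variable(torch.randn(1, 10), volatile=True)         # inference mode: no graph is recorded

y = x.mm(w)
print(y.requires_grad, y.volatile)  # False True: volatility propagates to the output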

I'm having a similar CUDA memory issue, but I'm only running a shallow CNN. I was wondering if anyone could provide any insight as to where this memory leak is coming from?

My model is pretty small:

class RegressionalNet(torch.nn.Module):
    
    def __init__(self):
        super().__init__()
        self.feature_extractor = torch.nn.Sequential(
            torch.nn.Conv2d(1, 64, 5, padding=2),
            torch.nn.ReLU(),
            torch.nn.Conv2d(64, 128, 5, padding=2, bias=False),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(128),
            torch.nn.MaxPool2d(2),
            torch.nn.Conv2d(128, 256, 5, padding=2, bias=False),
            torch.nn.ReLU(),
            torch.nn.BatchNorm2d(256),
            torch.nn.MaxPool2d(2))

        self.classifier = torch.nn.Sequential(
            torch.nn.Dropout(0.5),
            torch.nn.Linear(256*16*16, 256),  # fully connected layer
            torch.nn.ReLU(),
            torch.nn.Linear(256, 1))
        
    def forward(self,x):
        features = self.feature_extractor(x)
        output = self.classifier(features.view(int(x.size()[0]),-1))
        return output

My training procedure looks as follows:

def train(
          model: torch.nn.Module, 
          transforms, 
          data_path= DATA_PATH, 
          val_path= VAL_PATH, 
          num_epochs=101, 
          batch_size=64, 
          verbose=True,
          cube_length=640, img_size=(64, 64), 
          loss=torch.nn.MSELoss(), 
          lr_schedule=True, initial_lr=1e-3, suffix=""):

    data_path = os.path.abspath(data_path)
    val_path = os.path.abspath(val_path)	
    model = model.train()
    device = torch.device("cuda")
    model = model.to(device).to(torch.float)
    
    """ LOADING IN THE REAL TRAINING AND VALIDATION DATASETS """
    
    loader = DataLoader(FITSCubeDataset(data_path, cube_length, transforms, img_size), 
                        batch_size=batch_size, shuffle=True)                        
    validation_loader = DataLoader(FITSCubeDataset(val_path, cube_length, transforms, img_size), 
                                   batch_size=batch_size, shuffle=True)                                     
    
    optim = torch.optim.Adam(model.parameters(), initial_lr)	
    accuracies, val_accuracies, epochs, val_epochs = [0], [0], [0], [0]
	
    for i in range(num_epochs):
        print("Epoch %d of %d" % (i+1, num_epochs))
        _accuracies,_val_accuracies = [],[]    
        model.train(True) 
        for idx, (batch, target) in enumerate(tqdm(loader)):
            batch = batch.to(device).to(torch.float)
            if isinstance(loss, torch.nn.CrossEntropyLoss):
                target = target.to(device).to(torch.long)
            else:
                target = target.to(device).to(torch.float)
            pred = model(batch).reshape(-1)
            loss_value = loss(pred, target)
            optim.zero_grad()
            loss_value.backward()
            optim.step()
                
            ###Change the error metric here###

            _accuracies.append(loss_value)   
        epochs.append(i+1)                
        mean_accuracy = sum(_accuracies)/len(_accuracies)
        accuracies.append(mean_accuracy)
        print("Mean training loss: %f" % mean_accuracy)        

I've checked using watch -n 0.5 nvidia-smi to see if it really is a GPU memory issue, and the GPU memory does max out in epoch 1, at batch number 606, every time.

You need to do

_accuracies.append(loss_value.detach())

because otherwise you are saving the complete computation graph for every sample, which causes the OOM.

Oh wow, yeah, I can see how that would take up the memory very quickly :open_mouth: Many thanks! It's working perfectly now.

In my case, I followed @justusschock's advice and the out-of-memory issue was gone.
To understand detach vs item, I would recommend this PyTorch Forums discussion.
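
For what it's worth, a minimal sketch of the difference (the scalar loss here is a stand-in, not the one from the training code above):

# Sketch: detach() vs item() on a scalar loss tensor.
import torch

loss_value = (torch.randn(8, requires_grad=True) ** 2).mean()  # stand-in for a real loss

as_tensor = loss_value.detach()  # still a tensor (and still on the GPU if the loss was), but with no graph attached
as_float = loss_value.item()     # a plain Python float, copied back to the CPU

print(type(as_tensor), type(as_float))  # <class 'torch.Tensor'> <class 'float'>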