CUDA error: out of memory for Multi GPU servers

Dear @All

I’m trying to run the Transformer tutorial from HarvardNLP. I have a server with 4 GPUs, and I get `CUDA error: out of memory` with a batch size of 512.

# For data loading.
from torchtext import data, datasets

if True:
    # Special tokens and size limits shared by the fields and vocabularies.
    BOS_WORD = '<s>'
    EOS_WORD = '</s>'
    BLANK_WORD = "<blank>"
    MAX_LEN = 100
    MIN_FREQ = 2

    # Source and target fields share the same tokenizer and padding token;
    # only the target field adds begin/end-of-sentence markers.
    # NOTE(review): fix_length is 50 while MAX_LEN below is 100 -- confirm
    # that filtering at 100 tokens is intended for sequences fixed to 50.
    SRC = data.Field(
        tokenize=bpemb_ar.encode,
        pad_token=BLANK_WORD,
        fix_length=50,
    )
    TRG = data.Field(
        tokenize=bpemb_ar.encode,
        init_token=BOS_WORD,
        eos_token=EOS_WORD,
        pad_token=BLANK_WORD,
        fix_length=50,
    )

    # Load the three CSV splits from ./data/, skipping the header row and
    # keeping only examples whose 'src' and 'trg' are both at most MAX_LEN
    # long.
    train, val, test = TabularDataset.splits(
        path='./data/',
        train='SCUT_trai.csv',
        validation='SCUT_vali.csv',
        test='test.csv',
        format='csv',
        fields=[('src', SRC), ('trg', TRG)],
        skip_header=True,
        filter_pred=lambda x: all(
            len(vars(x)[column]) <= MAX_LEN for column in ('src', 'trg')
        ),
    )

    # Vocabularies are built from the training split only, using MIN_FREQ
    # as the frequency threshold.
    SRC.build_vocab(train.src, min_freq=MIN_FREQ)
    TRG.build_vocab(train.trg, min_freq=MIN_FREQ)
class MultiGPULossCompute:
    """Compute the loss on several GPUs and, optionally, take a training step.

    The generator and the criterion are applied replica-by-replica to slices
    of the model output taken along dimension 1, ``chunk_size`` positions at
    a time.  When an optimizer wrapper is supplied, the gradients with
    respect to the model output are collected chunk by chunk and then pushed
    back through ``out`` in a single backward pass.
    """

    def __init__(self, generator, criterion, devices, opt=None, chunk_size=5):
        """Store the configuration and replicate the criterion.

        Args:
            generator: module applied to each output chunk; it is replicated
                onto ``devices`` on every call, not here.
            criterion: loss module; replicated onto ``devices`` once, here.
            devices: device ids handed to the ``nn.parallel`` helpers.
            opt: optional optimizer wrapper; must expose ``step()`` and an
                ``optimizer`` attribute with ``zero_grad()``.  ``None`` means
                evaluation only (no backward pass, no update).
            chunk_size: number of positions along dimension 1 processed per
                generator/criterion call.
        """
        self.devices = devices
        self.chunk_size = chunk_size
        self.opt = opt
        self.generator = generator
        # One criterion replica per device, created a single time.
        self.criterion = nn.parallel.replicate(criterion, devices=devices)

    def __call__(self, out, targets, normalize):
        """Return the un-normalized total loss; update weights if ``opt`` is set.

        Args:
            out: model output; scattered across the devices and sliced along
                dimension 1 (presumably ``(batch, seq, d_model)`` -- confirm
                against the caller).
            targets: target tensor; scattered and sliced the same way as
                ``out``.
            normalize: divisor applied to every chunk loss before it is
                accumulated and back-propagated.

        Returns:
            The accumulated normalized loss multiplied back by ``normalize``.
        """
        training = self.opt is not None
        step = self.chunk_size
        total = 0.0

        # The generator is re-replicated on each call.
        generators = nn.parallel.replicate(self.generator,
                                           devices=self.devices)
        out_parts = nn.parallel.scatter(out, target_gpus=self.devices)
        grad_chunks = [[] for _ in out_parts]
        target_parts = nn.parallel.scatter(targets, target_gpus=self.devices)

        # Walk over dimension 1 in windows of `step` positions.
        for start in range(0, out_parts[0].size(1), step):
            window = slice(start, start + step)

            # Wrap the raw data of each slice in a fresh Variable so that the
            # chunk is cut off from the model's graph; its gradient is
            # collected below and replayed through `out` after the loop.
            columns = [
                [Variable(part[:, window].data, requires_grad=training)]
                for part in out_parts
            ]
            predictions = nn.parallel.parallel_apply(generators, columns)

            # Flatten predictions to 2-D and targets to 1-D per replica.
            criterion_inputs = [
                (pred.contiguous().view(-1, pred.size(-1)),
                 tgt[:, window].contiguous().view(-1))
                for pred, tgt in zip(predictions, target_parts)
            ]
            losses = nn.parallel.parallel_apply(self.criterion,
                                                criterion_inputs)

            # Collect the per-replica losses on the first device, then sum
            # and normalize them.
            chunk_loss = nn.parallel.gather(losses,
                                            target_device=self.devices[0])
            chunk_loss = chunk_loss.sum() / normalize
            total += chunk_loss.data

            # Back-propagate only as far as the detached chunk inputs and
            # keep a copy of each replica's gradient.
            if training:
                chunk_loss.backward()
                for replica in range(len(losses)):
                    chunk_grad = columns[replica][0].grad.data.clone()
                    grad_chunks[replica].append(chunk_grad)

        # Stitch the chunk gradients back together along dimension 1, gather
        # them on the first device and push them through `out`.
        if training:
            stacked = [Variable(torch.cat(chunks, dim=1))
                       for chunks in grad_chunks]
            out_gradient = nn.parallel.gather(stacked,
                                              target_device=self.devices[0])
            out.backward(gradient=out_gradient)
            self.opt.step()
            self.opt.optimizer.zero_grad()
        return total * normalize
# Device ids used for data-parallel training.
devices = [0, 1, 2, 3]
if True:
    pad_idx = TRG.vocab.stoi["<blank>"]

    # Build the model and move it to the current CUDA device.
    model = make_model(len(SRC.vocab), len(TRG.vocab), N=6)
    model.cuda()

    # The label-smoothing criterion is sized to the target vocabulary and is
    # told which index is padding.
    criterion = LabelSmoothing(
        size=len(TRG.vocab),
        padding_idx=pad_idx,
        smoothing=0.1,
    )
    criterion.cuda()

    BATCH_SIZE = 256  # 12000

    # Both iterators use the same batch size, device, sort key and
    # batch-size function; they differ only in the dataset and `train` flag.
    train_iter = MyIterator(
        train,
        batch_size=BATCH_SIZE,
        device=0,
        repeat=False,
        sort_key=lambda x: (len(x.src), len(x.trg)),
        batch_size_fn=batch_size_fn,
        train=True,
    )
    valid_iter = MyIterator(
        val,
        batch_size=BATCH_SIZE,
        device=0,
        repeat=False,
        sort_key=lambda x: (len(x.src), len(x.trg)),
        batch_size_fn=batch_size_fn,
        train=False,
    )

    # Wrap the model so forward passes are spread over all listed devices.
    model_par = nn.DataParallel(model, device_ids=devices)
None
/home/akram/anaconda3/lib/python3.6/site-packages/ipykernel_launcher.py:20: UserWarning: nn.init.xavier_uniform is now deprecated in favor of nn.init.xavier_uniform_.

---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-40-7da75baf3635> in <module>
      4     pad_idx = TRG.vocab.stoi["<blank>"]
      5     model = make_model(len(SRC.vocab), len(TRG.vocab), N=6)
----> 6     model.cuda()
      7     criterion = LabelSmoothing(size=len(TRG.vocab), padding_idx=pad_idx, smoothing=0.1)
      8     criterion.cuda()

~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in cuda(self, device)
    263             Module: self
    264         """
--> 265         return self._apply(lambda t: t.cuda(device))
    266 
    267     def cpu(self):

~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    191     def _apply(self, fn):
    192         for module in self.children():
--> 193             module._apply(fn)
    194 
    195         for param in self._parameters.values():

~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    191     def _apply(self, fn):
    192         for module in self.children():
--> 193             module._apply(fn)
    194 
    195         for param in self._parameters.values():

~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    191     def _apply(self, fn):
    192         for module in self.children():
--> 193             module._apply(fn)
    194 
    195         for param in self._parameters.values():

~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    191     def _apply(self, fn):
    192         for module in self.children():
--> 193             module._apply(fn)
    194 
    195         for param in self._parameters.values():

~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    191     def _apply(self, fn):
    192         for module in self.children():
--> 193             module._apply(fn)
    194 
    195         for param in self._parameters.values():

~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    191     def _apply(self, fn):
    192         for module in self.children():
--> 193             module._apply(fn)
    194 
    195         for param in self._parameters.values():

~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in _apply(self, fn)
    197                 # Tensors stored in modules are graph leaves, and we don't
    198                 # want to create copy nodes, so we have to unpack the data.
--> 199                 param.data = fn(param.data)
    200                 if param._grad is not None:
    201                     param._grad.data = fn(param._grad.data)

~/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in <lambda>(t)
    263             Module: self
    264         """
--> 265         return self._apply(lambda t: t.cuda(device))
    266 
    267     def cpu(self):

RuntimeError: CUDA error: out of memory

The whole code is copied from the original article. Any suggestions?

Try to reduce the batch size and check which batch size would fit and how much memory is used.
Your GPUs might not have enough memory for the code you are using. Or are you using exactly the same setup as the authors?

You’re right, Mr. @ptrblck: the problem is that GPU 0 is shared with some other users.