PyTorch trying to allocate 57 GB of memory when calling the bilinear module

Hello, we have created the model below:

from typing import Any

import pytorch_wrapper.functional as pwF

from torch import nn
import torch

class DependencyParsingModel(nn.Module):

    def __init__(self, bert_model: nn.Module, dp):
        """
        :param bert_model: the BERT model (nn.Module)
        :param dp: the dropout probability
        """
        super(DependencyParsingModel, self).__init__()
        self._bert_model = bert_model
        self._dp = nn.Dropout(dp)
        self.bilinear = nn.Bilinear(768, 768, 1)
        self.device = 'cuda'

        self._linear = nn.Linear(2 * 768, 1)


    def forward(self, text, text_len, pairs):
        attention_mask = pwF.create_mask_from_length(text_len, text.shape[1])
        bert = self._bert_model(text, attention_mask=attention_mask)
        bert_output = self._dp(bert[0])
        cls_token = bert[1]

        bs = pairs.shape[0]
        mseq = bert_output.shape[1]
        reshaped_bert_output = torch.reshape(bert_output, (bs * mseq, bert_output.shape[2]))

        # build flat indices into the flattened BERT output so we can gather both tokens of every pair
        mpair = pairs.shape[1]

        reshaped_pairs = torch.reshape(pairs, (bs * mpair * 2,))

        segment_size = mpair * 2

        repeats = torch.full((bs,), segment_size, dtype=torch.long)
        offset_vector = torch.repeat_interleave(repeats)
        offset_vector = offset_vector.to(device=self.device)

        indices = reshaped_pairs + (offset_vector * mseq)

        # gather the (head, dependent) token embeddings for every pair
        final_pairs = torch.index_select(reshaped_bert_output, 0, indices)

        reshaped_final_pairs = torch.reshape(final_pairs, (bs, mpair, 2, 768))


        # Two ways to compute arc scores: 1) linear 2) bilinear
        # 1) linear over the concatenated pair embeddings:
        # reshaped_final_pairs = torch.reshape(reshaped_final_pairs, (bs, mpair, 2 * 768))
        # output_linear = self._linear(reshaped_final_pairs)
        # 2) bilinear over the head and dependent embeddings:
        output_linear = self.bilinear(reshaped_final_pairs[:, :, 0, :], reshaped_final_pairs[:, :, 1, :])

        # this reshape assumes mpair == mseq * mseq (one score per ordered token pair)
        output_linear = output_linear.reshape(bs, mseq, mseq)
        #output_linear = torch.squeeze(output_linear,-1)
        return output_linear

If we compute the arc scores with the bilinear line

output_linear = self.bilinear(reshaped_final_pairs[:, :, 0, :], reshaped_final_pairs[:, :, 1, :])

we get the following error:

RuntimeError: CUDA out of memory. Tried to allocate 57.11 GiB (GPU 0; 7.93 GiB total capacity; 990.33 MiB already allocated; 6.34 GiB free; 1012.00 MiB reserved in total by PyTorch)

followed by a C++ stack trace with frames like these:

frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7fed2c1871e2 in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x1e64b (0x7fed315ec64b in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #2: <unknown function> + 0x1f464 (0x7fed315ed464 in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #3: <unknown function> + 0x1faa1 (0x7fed315edaa1 in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #4: at::native::empty_cuda(c10::ArrayRef<long>, c10::TensorOptions const&, c10::optional<c10::MemoryFormat>) + 0x11e (0x7fecc4fca52e in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)


...


frame #31: torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x89 (0x7fecfc40b119 in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #32: torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x4a (0x7fed2c8ec34a in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #33: <unknown function> + 0xc70f (0x7fed3180b70f in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libtorch.so)
frame #34: <unknown function> + 0x76ba (0x7fed3e9de6ba in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #35: clone + 0x6d (0x7fed3dbc14dd in /lib/x86_64-linux-gnu/libc.so.6)

However, when we use the linear implementation to compute the arc scores,

output_linear = self._linear(reshaped_final_pairs)

no error is raised and the model keeps training.

We have also observed that with batch size 1 the bilinear call runs out of CUDA memory while trying to allocate around 7 GiB, whereas with batch size 8 it fails while trying to allocate 57 GiB.
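
As a rough sanity check, assuming the bilinear call materializes an intermediate of about batch_size * mpair * 768 * 768 float32 values (an assumption on our part, not something we verified), the reported sizes would correspond to a sequence length of roughly 57 tokens with mpair = mseq ** 2:

# Back-of-the-envelope estimate, NOT a measurement: assumes the bilinear op
# builds an intermediate of batch_size * mpair * 768 * 768 float32 values.
# mseq = 57 is a hypothetical sequence length; mpair = mseq ** 2 (all token pairs).
dim = 768
mseq = 57
mpair = mseq ** 2

for bs in (1, 8):
    intermediate_bytes = bs * mpair * dim * dim * 4  # 4 bytes per float32
    print(bs, f"~{intermediate_bytes / 1024 ** 3:.1f} GiB")
# prints: 1 ~7.1 GiB
#         8 ~57.1 GiB

This is only an estimate, but it lines up with both reported allocation sizes.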

Is there something wrong with our code? Or is it something wrong with the bilinear implementation?

Thank you

Hi,

If you have a linear of size 768 -> 1, then the weight matrix will be of size 768.
But if you have a bilinear from 768x768 -> 1, then the weight matrix will be of size 768 * 768, which is much bigger.
And so, depending on the batch size, it will consume a lot more memory.
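
To make that concrete, here is a small sketch (just for illustration, using the layer sizes from the model above) comparing the two weight matrices:

from torch import nn

linear = nn.Linear(2 * 768, 1)       # scores the concatenated pair embedding
bilinear = nn.Bilinear(768, 768, 1)  # scores the (head, dependent) pair directly

print(linear.weight.shape)    # torch.Size([1, 1536])     -> 1,536 weights
print(bilinear.weight.shape)  # torch.Size([1, 768, 768]) -> 589,824 weights

The bilinear weight itself is still only a few MB, but every (batch, pair) element interacts with the full 768 x 768 form during the forward and backward pass, which is presumably why the working memory grows so much faster with batch size than in the linear case.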