PyTorch trying to allocate 57 GB of memory when calling the Bilinear module

Hello, we have created the model below:

from typing import Any

import pytorch_wrapper.functional as pwF

from torch import nn
import torch

class DependencyParsingModel(nn.Module):

    def __init__(self, bert_model : nn.Module, dp):
        """
        :param bert_model:  The bert model nn.Module
        :param dp: the drop out probability
        :param feat_to_size: a dict mapping from a string feature to an int ( the number of outputs for the feature)
        """
        super(DependencyParsingModel, self).__init__()
        self._bert_model = bert_model
        self._dp = nn.Dropout(dp)
        self.bilinear = nn.Bilinear(768, 768, 1)
        self.device = 'cuda'

        self._linear = nn.Linear(2*768 , 1)


    def forward(self, text, text_len, pairs):
        attention_mask = pwF.create_mask_from_length(text_len, text.shape[1])
        bert = self._bert_model(text, attention_mask=attention_mask)
        bert_output = self._dp(bert[0])  # token representations: (bs, mseq, 768)
        cls_token = bert[1]

        bs = pairs.shape[0]
        mseq = bert_output.shape[1]
        # flatten token representations to (bs * mseq, 768) so pairs can be gathered with one index_select
        reshaped_bert_output = torch.reshape(bert_output, (bs * mseq, bert_output.shape[2]))

        # creating indexes
        mpair = pairs.shape[1]

        reshaped_pairs = torch.reshape(pairs, (bs * mpair * 2,))

        segment_size = mpair * 2

        # offset each batch element's indices so they address the flattened (bs * mseq, 768) tensor
        repeats = torch.full((bs,), segment_size, dtype=torch.long)
        offset_vector = torch.repeat_interleave(repeats)
        offset_vector = offset_vector.to(device=self.device)

        indices = reshaped_pairs + (offset_vector * mseq)

        # output computation
        final_pairs = torch.index_select(reshaped_bert_output, 0, indices)

        reshaped_final_pairs = torch.reshape(final_pairs, (bs, mpair, 2, 768))


        # Two ways to compute arc scores: 1) linear, 2) bilinear
        # reshaped_final_pairs = torch.reshape(reshaped_final_pairs, (bs, mpair, 2*768))
        # output_linear = self._linear(reshaped_final_pairs)
        output_linear = self.bilinear((reshaped_final_pairs[:, :, 0, :]), (reshaped_final_pairs[:, :, 1, :]))

        output_linear = output_linear.reshape(bs, mseq, mseq)  # requires mpair == mseq * mseq
        # output_linear = torch.squeeze(output_linear, -1)
        return output_linear
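
To show the pair-gathering step in isolation, here is a small standalone sketch with toy sizes; the pairs tensor in it is a hypothetical one containing every ordered pair of token positions, so mpair = mseq * mseq (which the final reshape to (bs, mseq, mseq) requires anyway):

import torch

# toy sizes, purely illustrative
bs, mseq, hidden = 2, 4, 768
mpair = mseq * mseq  # one pair per (head, dependent) position combination

bert_output = torch.randn(bs, mseq, hidden)
# hypothetical pairs tensor: every ordered pair of token positions, shape (bs, mpair, 2)
pairs = torch.cartesian_prod(torch.arange(mseq), torch.arange(mseq)).repeat(bs, 1, 1)

reshaped_bert_output = bert_output.reshape(bs * mseq, hidden)
reshaped_pairs = pairs.reshape(bs * mpair * 2)
# per-element offsets (0, 0, ..., 1, 1, ...) mapping each index into the flattened tensor
offset_vector = torch.repeat_interleave(torch.full((bs,), mpair * 2, dtype=torch.long))
indices = reshaped_pairs + offset_vector * mseq

final_pairs = torch.index_select(reshaped_bert_output, 0, indices)
reshaped_final_pairs = final_pairs.reshape(bs, mpair, 2, hidden)
print(reshaped_final_pairs.shape)  # torch.Size([2, 16, 2, 768])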

If we compute the arc scores with the bilinear line
output_linear = self.bilinear((reshaped_final_pairs[:, :, 0, :]), (reshaped_final_pairs[:, :, 1, :]))
the following error is raised:

RuntimeError: CUDA out of memory. Tried to allocate 57.11 GiB (GPU 0; 7.93 GiB total capacity; 990.33 MiB already allocated; 6.34 GiB free; 1012.00 MiB reserved in total by PyTorch)

along with a C++ stack trace like this:

frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7fed2c1871e2 in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x1e64b (0x7fed315ec64b in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #2: <unknown function> + 0x1f464 (0x7fed315ed464 in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #3: <unknown function> + 0x1faa1 (0x7fed315edaa1 in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #4: at::native::empty_cuda(c10::ArrayRef<long>, c10::TensorOptions const&, c10::optional<c10::MemoryFormat>) + 0x11e (0x7fecc4fca52e in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libtorch_cuda.so)


...


frame #31: torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x89 (0x7fecfc40b119 in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #32: torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x4a (0x7fed2c8ec34a in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #33: <unknown function> + 0xc70f (0x7fed3180b70f in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libtorch.so)
frame #34: <unknown function> + 0x76ba (0x7fed3e9de6ba in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #35: clone + 0x6d (0x7fed3dbc14dd in /lib/x86_64-linux-gnu/libc.so.6)

However, when we use the linear implementation to calculate arc scores,
output_linear = self._linear(reshaped_final_pairs)
no error is raised and the model keeps training.

We have observed that with batch size 1 the bilinear call makes CUDA run out of memory while trying to allocate around 7 GiB; with batch size 8 it runs out of memory trying to allocate 57 GiB.
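In other words, the failed allocation comes out to roughly 57.11 GiB / 8 ≈ 7.1 GiB per batch element, so it appears to grow linearly with the batch size.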

Is there something wrong with our code? Or is it something wrong with the bilinear implementation?

Thank you

Hi,

If you have a linear of size 768 -> 1, then the weight matrix has 768 entries.
But if you have a bilinear from 768 x 768 -> 1, then the weight matrix has 768 * 768 entries, which is much bigger.
And so, depending on the batch size, it will consume a lot more memory.
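
As a minimal sketch of that size difference (using the layer sizes from the model above), the two layers compare like this; the einsum at the end spells out the bilinear form y = x1^T A x2 + b that nn.Bilinear computes:

import torch
from torch import nn

linear = nn.Linear(2 * 768, 1)       # weight shape (1, 1536)     ->    1536 weights
bilinear = nn.Bilinear(768, 768, 1)  # weight shape (1, 768, 768) ->  589824 weights

print(sum(p.numel() for p in linear.parameters()))    # 1537 (including bias)
print(sum(p.numel() for p in bilinear.parameters()))  # 589825 (including bias)

# nn.Bilinear computes y_o = x1^T A_o x2 + b_o; the same result written as an einsum:
x1 = torch.randn(8, 16, 768)
x2 = torch.randn(8, 16, 768)
y = torch.einsum('bpi,oij,bpj->bpo', x1, bilinear.weight, x2) + bilinear.bias
print((y - bilinear(x1, x2)).abs().max())  # tiny numerical noise, i.e. the two agree

Whether writing the score as an explicit einsum changes peak memory depends on how the contraction is evaluated, so treat it only as an illustration of where the 768 * 768 factor comes from.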