Hello, we have created this model below :
from typing import Any
import pytorch_wrapper.functional as pwF
from torch import nn
import torch
class DependencyParsingModel(nn.Module):
def __init__(self, bert_model : nn.Module, dp):
"""
:param bert_model: The bert model nn.Module
:param dp: the drop out probability
:param feat_to_size: a dict mapping from a string feature to an int ( the number of outputs for the feature)
"""
super(DependencyParsingModel, self).__init__()
self._bert_model = bert_model
self._dp = nn.Dropout(dp)
self.bilinear = nn.Bilinear(768, 768, 1)
self.device = 'cuda'
self._linear = nn.Linear(2*768 , 1)
def forward(self, text, text_len, pairs):
attention_mask = pwF.create_mask_from_length(text_len, text.shape[1])
bert = self._bert_model(text, attention_mask=attention_mask)
bert_output = self._dp(bert[0])
cls_token = bert[1]
bs = pairs.shape[0]
mseq = bert_output.shape[1]
reshaped_bert_output = torch.reshape(bert_output, (bs * mseq, bert_output.shape[2]))
# creating indexes
mpair = pairs.shape[1]
reshaped_pairs = torch.reshape(pairs, (bs * mpair * 2,))
segment_size = mpair * 2
repeats = torch.full((bs,), segment_size, dtype=torch.long)
offset_vector = torch.repeat_interleave(repeats)
offset_vector = offset_vector.to(device=self.device)
indices = reshaped_pairs + (offset_vector * mseq)
# output computation
final_pairs = torch.index_select(reshaped_bert_output, 0, indices)
reshaped_final_pairs = torch.reshape(final_pairs, (bs, mpair, 2, 768))
# Non bilinear computeation
# Two ways to cmpute arc scores 1) linear 2) bilinear
#reshaped_final_pairs = torch.reshape(reshaped_final_pairs , (bs , mpair, 2*768))
#output_linear = self._linear(reshaped_final_pairs)
output_linear = self.bilinear((reshaped_final_pairs[:, :, 0, :]), (reshaped_final_pairs[:, :, 1, :]))
output_linear = output_linear.reshape(bs,mseq,mseq)
#output_linear = torch.squeeze(output_linear,-1)
return output_linear
If we create the model with the line
output_linear = self.bilinear((reshaped_final_pairs[:, :, 0, :]), (reshaped_final_pairs[:, :, 1, :]))
the following error shows:
RuntimeError: CUDA out of memory. Tried to allocate 57.11 GiB (GPU 0; 7.93 GiB total capacity; 990.33 MiB already allocated; 6.34 GiB free; 1012.00 MiB reserved in total by PyTorch)
also some weird frame numbers like so
frame #0: c10::Error::Error(c10::SourceLocation, std::string) + 0x42 (0x7fed2c1871e2 in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libc10.so)
frame #1: <unknown function> + 0x1e64b (0x7fed315ec64b in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #2: <unknown function> + 0x1f464 (0x7fed315ed464 in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #3: <unknown function> + 0x1faa1 (0x7fed315edaa1 in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libc10_cuda.so)
frame #4: at::native::empty_cuda(c10::ArrayRef<long>, c10::TensorOptions const&, c10::optional<c10::MemoryFormat>) + 0x11e (0x7fecc4fca52e in /home/nsmyrnioudis/env/lib/python3.6/site
-packages/torch/lib/libtorch_cuda.so)
...
frame #31: torch::autograd::Engine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x89 (0x7fecfc40b119 in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libtorch_cpu.so)
frame #32: torch::autograd::python::PythonEngine::thread_init(int, std::shared_ptr<torch::autograd::ReadyQueue> const&, bool) + 0x4a (0x7fed2c8ec34a in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libtorch_python.so)
frame #33: <unknown function> + 0xc70f (0x7fed3180b70f in /home/nsmyrnioudis/env/lib/python3.6/site-packages/torch/lib/libtorch.so)
frame #34: <unknown function> + 0x76ba (0x7fed3e9de6ba in /lib/x86_64-linux-gnu/libpthread.so.0)
frame #35: clone + 0x6d (0x7fed3dbc14dd in /lib/x86_64-linux-gnu/libc.so.6)
However when we use the linear implementation to calculate arc scores
output_linear = self._linear(reshaped_final_pairs)
no error shows and the model continues training.
We have observed that with batch size = 1 and the bilinear call CUDA runs out of memory trying to allocate around 7 GiB. Now with batch size 8 CUDA runs out of memory trying to allocate 57 GiB.
Is there something wrong with our code? Or is it something wrong with the bilinear implementation?
Thank you