I have a really strange issue with my NLP model. My GPU has 4 GB of memory, the model and embeddings consume about 1 GB, and my mini-batches of size 25 should only be around 5-10 MB. During the first iteration nvidia-smi shows about 1100 MB in use and torch.cuda.memory_allocated() reports about 680 MB, yet I still get a CUDA out of memory error. What is wrong with the model?
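As far as I understand, torch.cuda.memory_allocated() only counts live tensors, while nvidia-smi also includes the CUDA context and whatever PyTorch's caching allocator has reserved, so a helper along the lines of the sketch below would show both numbers side by side (report_gpu_memory is just for illustration; memory_reserved() only exists in newer PyTorch versions, hence the fallback to the older memory_cached()):

import torch

def report_gpu_memory(tag=''):
    # Memory occupied by live tensors -- this is what the prints further down report
    allocated = torch.cuda.memory_allocated() / 1024 ** 2
    # Memory held by the caching allocator; together with the CUDA context
    # this is roughly what nvidia-smi shows for the process
    reserved_fn = getattr(torch.cuda, 'memory_reserved', torch.cuda.memory_cached)
    reserved = reserved_fn() / 1024 ** 2
    print('{}: allocated {:.1f} MB, reserved {:.1f} MB'.format(tag, allocated, reserved))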
This is my model definition:
import torch
import torch.nn as nn
import torch.nn.functional as F
class Model(nn.Module):
    def __init__(self, **params):
        super(Model, self).__init__()
        self.channel_size = params['channels']
        # Region embedding: collapses the word-embedding dimension into channels
        self.conv_region_embedding = nn.Conv2d(1, self.channel_size, (3, params['emb_size']), stride=1)
        self.conv3 = nn.Conv2d(self.channel_size, self.channel_size, (3, 1), stride=1)
        self.pooling = nn.MaxPool2d(kernel_size=(3, 1), stride=2)
        self.padding_conv = nn.ZeroPad2d((0, 0, 1, 1))
        self.padding_pool = nn.ZeroPad2d((0, 0, 0, 1))
        self.act_fun = nn.ReLU()
        self.linear_out = nn.Linear(self.channel_size, 3)
        self.dropout = nn.Dropout2d(params['dropout'])
        self.ad_pool = lambda x: F.adaptive_max_pool2d(x, (1, 1)).cuda()

    def forward(self, x):
        batch = x.shape[0]
        x = x.view(x.size(0), 1, x.size(1), x.size(2))
        x = self.conv_region_embedding(x)
        x = self.dropout(x)
        print('Tensor size: {}, Memory allocated: {} MB'.format(x.size(), torch.cuda.memory_allocated() / 1024**2))
        px = self.pooling(x)
        x = self.pooling(x)
        x = self.padding_conv(x)
        x = self.act_fun(x)
        x = self.conv3(x)
        x = self.dropout(x)
        print('Tensor size: {}, Memory allocated: {} MB'.format(x.size(), torch.cuda.memory_allocated() / 1024**2))
        x = self.padding_conv(x)
        x = self.act_fun(x)
        x = self.conv3(x)
        x = self.dropout(x)
        print('Tensor size: {}, Memory allocated: {} MB'.format(x.size(), torch.cuda.memory_allocated() / 1024**2))
        x = x + px
        del px
        # Keep applying pooled residual blocks until the sequence dimension is at most 2
        while x.size()[-2] > 2:
            x = self._block(x)
            print('Tensor size: {}, Memory allocated: {} MB'.format(x.size(), torch.cuda.memory_allocated() / 1024**2))
        x = self.ad_pool(x)
        print('Tensor size: {}, Memory allocated: {} MB'.format(x.size(), torch.cuda.memory_allocated() / 1024**2))
        x = x.view(batch, self.channel_size)
        x = self.linear_out(x)
        return x

    def _block(self, x):
        px = self.pooling(x)
        x = self.pooling(x)
        # Convolution
        x = self.padding_conv(px)
        x = self.act_fun(x)
        x = self.conv3(x)
        x = self.dropout(x)
        x = self.padding_conv(x)
        x = self.act_fun(x)
        x = self.conv3(x)
        x = self.dropout(x)
        x = x + px
        del px
        return x
And this is the code I use to execute it:
import torch
import torch.nn as nn
import torchtext.data as data
from torch.optim import Adam

model = Model(channels=128, emb_size=300, dropout=0).cuda()

TEXT = data.Field(batch_first=True, pad_token=None)
LABEL = data.Field(sequential=False, pad_token=None, use_vocab=False)
fields = [('id', None),
          ('category', None),
          ('content', TEXT),
          ('stars', LABEL),
          ('word_count', None)]
train, dev = data.TabularDataset.splits('',
                                        train='train.csv',
                                        validation='dev.csv',
                                        format="csv",
                                        fields=fields,
                                        skip_header=True)
TEXT.build_vocab(train)

# Frozen embedding table, kept outside of the model
emb = nn.Embedding(len(TEXT.vocab), 300).cuda()
emb.weight.requires_grad_(False)

train_dl = data.Iterator(train, batch_size=25, repeat=False, shuffle=False)
optimizer = Adam(model.parameters(), lr=1e-3)

for i, batch in enumerate(iter(train_dl)):
    x = batch.content.cuda()
    print('Tensor size: {}, Memory allocated: {} MB'.format(x.size(), torch.cuda.memory_allocated() / 1024**2))
    x = emb(x)
    print('Tensor size: {}, Memory allocated: {} MB'.format(x.size(), torch.cuda.memory_allocated() / 1024**2))
    # Stars are mapped onto three classes: 0 (< 5), 1 (5-6), 2 (>= 7)
    y = ((batch.stars >= 7).type(torch.long) + (batch.stars >= 5).type(torch.long)).cuda()
    y_pred = model(x)
    crit = nn.CrossEntropyLoss()
    optimizer.zero_grad()
    loss = crit(y_pred, y)
    loss.backward()
    optimizer.step()
    break
The output is:
Tensor size: torch.Size([25, 658]), Memory allocated: 586.00634765625 MB
Tensor size: torch.Size([25, 658, 300]), Memory allocated: 604.88134765625 MB
Tensor size: torch.Size([25, 128, 656, 1]), Memory allocated: 613.0068359375 MB
Tensor size: torch.Size([25, 128, 327, 1]), Memory allocated: 645.2568359375 MB
Tensor size: torch.Size([25, 128, 327, 1]), Memory allocated: 653.5068359375 MB
Tensor size: torch.Size([25, 128, 163, 1]), Memory allocated: 664.0068359375 MB
Tensor size: torch.Size([25, 128, 81, 1]), Memory allocated: 673.12060546875 MB
Tensor size: torch.Size([25, 128, 40, 1]), Memory allocated: 676.63623046875 MB
Tensor size: torch.Size([25, 128, 19, 1]), Memory allocated: 678.357421875 MB
Tensor size: torch.Size([25, 128, 9, 1]), Memory allocated: 679.22412109375 MB
Tensor size: torch.Size([25, 128, 4, 1]), Memory allocated: 679.66357421875 MB
Tensor size: torch.Size([25, 128, 1, 1]), Memory allocated: 679.8466796875 MB
Tensor size: torch.Size([25, 128, 1, 1]), Memory allocated: 679.88330078125 MB
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-4-071a17ef7257> in <module>()
23 loss = crit(y_pred, y)
24
---> 25 loss.backward()
26
27 optimizer.step()
~/anaconda3/lib/python3.6/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
91 products. Defaults to ``False``.
92 """
---> 93 torch.autograd.backward(self, gradient, retain_graph, create_graph)
94
95 def register_hook(self, hook):
~/anaconda3/lib/python3.6/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
88 Variable._execution_engine.run_backward(
89 tensors, grad_tensors, retain_graph, create_graph,
---> 90 allow_unreachable=True) # allow_unreachable flag
91
92
RuntimeError: CUDA error: out of memory
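In case it matters: would printing the peak usage with torch.cuda.max_memory_allocated() right before and after the backward call (assuming it even gets that far) tell me more than the snapshots above? Roughly:

print('Peak allocated: {} MB'.format(torch.cuda.max_memory_allocated() / 1024**2))
loss.backward()
print('Peak allocated after backward: {} MB'.format(torch.cuda.max_memory_allocated() / 1024**2))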
Can anyone help with this issue?