I tried to run a model on Colab and got the error below, which looks really strange (it tries to allocate 256.00 GiB!). The same error occurs if I change the data size, change the batch size, or clear the GPU memory.
The main model is a self-attention module (the data are images).
Here is the traceback:
Traceback (most recent call last):
File "./train.py", line 169, in <module>
miou_current = val(opt, model)
File "./train.py", line 86, in val
score = model.test(val=True) # run inference
File "/content/gdrive/My Drive/Colab Notebooks/STANet-withpth/models/CDFA_model.py", line 72, in test
self.forward()
File "/content/gdrive/My Drive/Colab Notebooks/STANet-withpth/models/CDFA_model.py", line 90, in forward
self.feat_A, self.feat_B = self.netA(self.feat_A,self.feat_B)
File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/content/gdrive/My Drive/Colab Notebooks/STANet-withpth/models/backbone.py", line 46, in forward
x = self.Self_Att(x)
File "/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "/content/gdrive/My Drive/Colab Notebooks/STANet-withpth/models/BAM.py", line 37, in forward
energy = torch.bmm(proj_query, proj_key) # transpose check
RuntimeError: CUDA out of memory. Tried to allocate 256.00 GiB (GPU 0; 14.76 GiB total capacity; 824.42 MiB already allocated; 11.68 GiB free; 1.80 GiB reserved in total by PyTorch)
import torch
import torch.nn.functional as F
from torch import nn
class BAM(nn.Module):
    """Basic self-attention module over a spatially pooled feature map.

    The input is average-pooled by a factor ``ds``, every pooled location
    attends to every other pooled location, and the attended features are
    upsampled back to the input resolution and added to the input.

    Memory note: with ``N`` pooled locations per sample, the attention
    matrix built in ``forward`` has ``B * N * N`` elements, so its size grows
    quadratically with the pooled spatial size. A larger ``ds`` shrinks ``N``
    by ``ds * ds`` and the attention matrix by ``ds ** 4``.

    Args:
        in_dim: number of channels of the input feature map.
        ds: pooling factor applied before attention is computed.
        activation: stored on the instance as ``self.activation``; it is not
            used anywhere inside this class.
    """

    def __init__(self, in_dim, ds=8, activation=nn.ReLU):
        super(BAM, self).__init__()
        self.chanel_in = in_dim
        # Query/key projections use a reduced channel count.
        self.key_channel = self.chanel_in // 8
        self.activation = activation
        self.ds = ds
        self.pool = nn.AvgPool2d(self.ds)
        print('ds: ', ds)

        # 1x1 convolutions producing the query, key and value tensors.
        self.query_conv = nn.Conv2d(in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1)
        self.key_conv = nn.Conv2d(in_channels=in_dim, out_channels=in_dim // 8, kernel_size=1)
        self.value_conv = nn.Conv2d(in_channels=in_dim, out_channels=in_dim, kernel_size=1)

        # Registered as a learnable parameter; forward() does not read it.
        self.gamma = nn.Parameter(torch.zeros(1))
        # Normalises each row of the attention matrix to sum to 1.
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, input):
        """Apply pooled self-attention and add the result to ``input``.

        Args:
            input: 4-D feature map ``(B, C, H, W)`` with ``C == in_dim``.

        Returns:
            Tensor with the same shape as ``input``: the upsampled attention
            output plus ``input``.
        """
        x = self.pool(input)
        m_batchsize, C, width, height = x.size()
        n_locations = width * height

        # (B, N, C // 8): one query vector per pooled location.
        proj_query = self.query_conv(x).view(m_batchsize, -1, n_locations).permute(0, 2, 1)
        # (B, C // 8, N): one key vector per pooled location.
        proj_key = self.key_conv(x).view(m_batchsize, -1, n_locations)

        # (B, N, N) pairwise similarities -- the dominant memory cost.
        energy = torch.bmm(proj_query, proj_key)
        energy = (self.key_channel ** -.5) * energy
        attention = self.softmax(energy)

        # (B, C, N) values, mixed according to the attention weights.
        proj_value = self.value_conv(x).view(m_batchsize, -1, n_locations)
        out = torch.bmm(proj_value, attention.permute(0, 2, 1))
        out = out.view(m_batchsize, C, width, height)

        # Upsample to the input's own spatial size. AvgPool2d floors the
        # pooled size, so pooled_size * ds can be smaller than the input when
        # the input is not a multiple of ds; targeting the input size keeps
        # the residual addition below valid in that case too.
        out = F.interpolate(out, list(input.size()[2:]))
        out = out + input
        return out
The settings are: batch_size = 8, C = 64, N = 128*64.
Has anyone had the same error, or does anyone know how to deal with it, or can anyone at least explain why this is happening?