Traceback (most recent call last):
File “check1.py”, line 317, in
main()
File “check1.py”, line 172, in main
iteration = train(epoch,iteration)
File “check1.py”, line 128, in train
pred = model(img)
File “/DATA/rani.1/.local/lib/python3.6/site-packages/torch/nn/modules/module.py”, line 550, in call
result = self.forward(*input, **kwargs)
File “./model/model.py”, line 155, in forward
x = self.backend(x)
File “/DATA/rani.1/.local/lib/python3.6/site-packages/torch/nn/modules/module.py”, line 550, in call
result = self.forward(*input, **kwargs)
File “/DATA/rani.1/.local/lib/python3.6/site-packages/torch/nn/modules/container.py”, line 100, in forward
input = module(input)
File “/DATA/rani.1/.local/lib/python3.6/site-packages/torch/nn/modules/module.py”, line 550, in call
result = self.forward(*input, **kwargs)
File “/DATA/rani.1/.local/lib/python3.6/site-packages/torch/nn/modules/container.py”, line 100, in forward
input = module(input)
File “/DATA/rani.1/.local/lib/python3.6/site-packages/torch/nn/modules/module.py”, line 550, in call
result = self.forward(*input, **kwargs)
File “/DATA/rani.1/.local/lib/python3.6/site-packages/torchvision/models/resnet.py”, line 113, in forward
out = self.bn3(out)
File “/DATA/rani.1/.local/lib/python3.6/site-packages/torch/nn/modules/module.py”, line 550, in call
result = self.forward(*input, **kwargs)
File “/DATA/rani.1/.local/lib/python3.6/site-packages/torch/nn/modules/batchnorm.py”, line 106, in forward
exponential_average_factor, self.eps)
File “/DATA/rani.1/.local/lib/python3.6/site-packages/torch/nn/functional.py”, line 1923, in batch_norm
training, momentum, eps, torch.backends.cudnn.enabled
RuntimeError: CUDA out of memory. Tried to allocate 1.10 GiB (GPU 0; 10.92 GiB total capacity; 9.94 GiB already allocated; 413.50 MiB free; 9.96 GiB reserved in total by PyTorch)
I get the above error whenever I call model.cuda(). When I remove that call the script runs fine, but only on the CPU, so I am unable to use the GPU.
Your model might be too big for your device. It’s a bit weird that you are seeing this error after calling model.cuda(), as this would mean that your model parameters are using more than 11GB.
Could this be the case?
If not, make sure that your GPU memory is free before running the script (check it via nvidia-smi).
This would mean that your single model is too large. You could count the number of parameters and buffers and calculate the theoretical size it would need on the GPU.
class Sequential(nn.Module):
    """CNN backbone + recurrent cell + scalar sigmoid head for image sequences.

    ``forward`` takes a 5-D tensor ``(batch, seq, c, h, w)``, runs every frame
    through the convolutional backend, pools the feature maps, feeds the
    per-frame features one step at a time into ``self.rnn`` and finally maps
    the last hidden state to a single value in ``(0, 1)``.

    Args:
        backend: ``'resnet'`` (torchvision ResNet-50 without its last two
            children) or ``'vgg'`` (VGG-19 ``features`` without the final
            max-pool). Any other value raises ``AssertionError``.
        seq_len: number of time steps the recurrent loop iterates over.
        embed_size: size passed to the recurrent cell and to the decoder.

    NOTE(review): all ``batch * seq`` frames go through the backend in one
    call with gradients enabled, so activation memory grows with the sequence
    length -- keep ``batch * seq`` small enough for the target GPU.
    """

    def __init__(self, backend, seq_len, embed_size=512):
        super(Sequential, self).__init__()
        self.seq_len = seq_len
        self.bk_name = backend

        # defining backend
        if self.bk_name == 'resnet':
            resnet = models.resnet50(pretrained=True)
            # Call self.dilate_resnet(resnet) here to use the dilated variant.
            self.init_resnet(resnet)
            # 8192 input features: presumably 2048 channels x (2 x 2) left
            # after the 7x7 average pooling in forward() -- TODO confirm
            # against the input resolution used by the caller.
            self.encoder = nn.Linear(8192, embed_size, bias=True)
        elif self.bk_name == 'vgg':
            vgg = models.vgg19(pretrained=True)
            self.init_vgg(vgg)
        else:
            # Explicit raise instead of `assert 0`: an assert statement is
            # stripped under `python -O`, which would leave self.backend
            # undefined. Exception type and message are unchanged.
            raise AssertionError('Backend not implemented')

        self.rnn = LSTM(embed_size)
        self.decoder = nn.Linear(embed_size, 1, bias=True)

        # Set to False to freeze the pretrained backend instead of
        # fine-tuning it.
        for para in self.backend.parameters():
            para.requires_grad = True

    def init_resnet(self, resnet):
        """Use every child of ``resnet`` except the last two as the backend."""
        self.backend = nn.Sequential(*list(resnet.children())[:-2])

    def init_vgg(self, vgg):
        """Use ``vgg.features`` without its last layer as the backend."""
        # omitting the last Max Pooling
        self.backend = nn.Sequential(*list(vgg.features.children())[:-1])

    def dilate_resnet(self, resnet):
        """Remove the stride of layer3/layer4 and dilate their 3x3 convs.

        Modifies ``resnet`` in place (modifying resnet as in SAM paper).
        """
        resnet.layer3[0].conv1.stride = 1
        resnet.layer3[0].downsample[0].stride = 1
        resnet.layer4[0].conv1.stride = 1
        resnet.layer4[0].downsample[0].stride = 1

        for block in resnet.layer3:
            block.conv2.dilation = 2
            block.conv2.padding = 2

        for block in resnet.layer4:
            block.conv2.dilation = 4
            block.conv2.padding = 4

    def init_hidden(self, x):
        """Return an all-zero ``(h, c)`` state shaped like ``x``.

        The two tensors share ``x``'s shape, dtype and device and do not
        require gradients. Return only one tensor when switching to a GRU.
        """
        return (torch.zeros_like(x), torch.zeros_like(x))

    def forward(self, x):
        """Score a batch of image sequences.

        Args:
            x: tensor of shape ``(batch, seq, c, h, w)``.

        Returns:
            Sigmoid of the decoder applied to the hidden state after
            ``self.seq_len`` recurrent steps.
        """
        batch, seq, channels, height, width = x.size()
        x = x.view(batch * seq, channels, height, width)

        # backend forward pass
        if self.bk_name == 'resnet':
            x = self.backend(x)
            x = F.avg_pool2d(x, (7, 7))
            x = x.view(batch * seq, -1)
            # training=self.training: the functional dropout defaults to
            # training=True and would otherwise stay active in eval mode.
            x = F.relu(self.encoder(
                F.dropout(x, p=0.4, training=self.training)))
        else:
            x = self.backend(x)
            # Average pooling for vgg19 without final maxpool
            x = F.avg_pool2d(x, (14, 14))

        x = x.view(batch, seq, -1)  # unroll features

        # recurrent loop
        state = self.init_hidden(x[:, 0, :])  # initialize hidden state
        for step in range(self.seq_len):
            cur_x = x[:, step, :].contiguous()
            state = self.rnn(cur_x, state)

        hidden, _cell = state  # LSTM
        hidden = F.dropout(hidden, p=0.5, training=self.training)
        return torch.sigmoid(self.decoder(hidden))
Are you seeing the OOM during the training or after calling model.cuda()?
From the first answers I understood that a simple model.cuda() is causing the OOM.
main()
File “t1.py”, line 179, in main
iteration = train(epoch,iteration)
File “t1.py”, line 135, in train
pred = model(img)
File “/DATA/rani.1/.local/lib/python3.6/site-packages/torch/nn/modules/module.py”, line 550, in call
result = self.forward(*input, **kwargs)
File “./model/model.py”, line 155, in forward
x = self.backend(x)
File “/DATA/rani.1/.local/lib/python3.6/site-packages/torch/nn/modules/module.py”, line 550, in call
result = self.forward(*input, **kwargs)
File “/DATA/rani.1/.local/lib/python3.6/site-packages/torch/nn/modules/container.py”, line 100, in forward
input = module(input)
File “/DATA/rani.1/.local/lib/python3.6/site-packages/torch/nn/modules/module.py”, line 550, in call
result = self.forward(*input, **kwargs)
File “/DATA/rani.1/.local/lib/python3.6/site-packages/torch/nn/modules/pooling.py”, line 141, in forward
self.return_indices)
File “/DATA/rani.1/.local/lib/python3.6/site-packages/torch/_jit_internal.py”, line 209, in fn
return if_false(*args, **kwargs)
File “/DATA/rani.1/.local/lib/python3.6/site-packages/torch/nn/functional.py”, line 539, in _max_pool2d
input, kernel_size, stride, padding, dilation, ceil_mode)
RuntimeError: CUDA out of memory. Tried to allocate 1.10 GiB (GPU 1; 10.92 GiB total capacity; 9.72 GiB already allocated; 637.50 MiB free; 9.74 GiB reserved in total by PyTorch)
I am getting this error even now after having batch size 1, any solution? @ptrblck
This error is raised during a forward pass, so the model.cuda() call will not cause the OOM as previously mentioned?
If so, please post a minimal, executable code snippet using random input tensors, so that we could debug it.
One more thing: it ran fine when I sent about 12 images together after concatenation, but when I do the same with about 240 images, I get the error again. @ptrblck
You would have to use a batch size for the model and training, which fits into the memory of your device.
Too large batch sizes will try to use too much memory and will thus yield the “out of memory” issue.