Cuda runtime error (2) : out of memory

Hi there.
I am new to PyTorch. Here is my code implementing a GAN architecture to generate images; I based it on the DCGAN example in the PyTorch GitHub repository. When I run it on my two GeForce GTX 1080s, with 64 GB of RAM (my CUDA version is 7.5), I get the following error:

RuntimeError                              Traceback (most recent call last)
<ipython-input> in <module>()
      7 label.data.resize_(batchSize).fill_(real_label)
      8
----> 9 output = netD(input)
     10 errD_real = criterion(output, label)
     11 errD_real.backward()

/home/mlcmdeep/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    200
    201     def __call__(self, *input, **kwargs):
--> 202         result = self.forward(*input, **kwargs)
    203         for hook in self._forward_hooks.values():
    204             hook_result = hook(self, input, result)

<ipython-input> in forward(self, input)
     32             gpu_ids = range(self.ngpu)
     33
---> 34         output = nn.parallel.data_parallel(self.main, input, gpu_ids)
     35         return output.view(-1,1)

/home/mlcmdeep/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in data_parallel(module, input, device_ids, output_device)
     90     inputs = scatter(input, device_ids)
     91     replicas = replicas[:len(inputs)]
---> 92     outputs = parallel_apply(replicas, inputs)
     93     return gather(outputs, output_device)

/home/mlcmdeep/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py in parallel_apply(modules, inputs)
     43         output = results[i]
     44         if isinstance(output, Exception):
---> 45             raise output
     46         outputs.append(output)
     47     return outputs

/home/mlcmdeep/anaconda3/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py in _worker(module, input, results, lock)
     24         try:
     25             with torch.cuda.device_of(var_input):
---> 26                 output = module(input)
     27             with lock:
     28                 results[input] = output

/home/mlcmdeep/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    200
    201     def __call__(self, *input, **kwargs):
--> 202         result = self.forward(*input, **kwargs)
    203         for hook in self._forward_hooks.values():
    204             hook_result = hook(self, input, result)

/home/mlcmdeep/anaconda3/lib/python3.6/site-packages/torch/nn/modules/container.py in forward(self, input)
     62     def forward(self, input):
     63         for module in self._modules.values():
---> 64             input = module(input)
     65         return input
     66

/home/mlcmdeep/anaconda3/lib/python3.6/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
    200
    201     def __call__(self, *input, **kwargs):
--> 202         result = self.forward(*input, **kwargs)
    203         for hook in self._forward_hooks.values():
    204             hook_result = hook(self, input, result)

/home/mlcmdeep/anaconda3/lib/python3.6/site-packages/torch/nn/modules/conv.py in forward(self, input, output_size)
    521         return F.conv_transpose2d(
    522             input, self.weight, self.bias, self.stride, self.padding,
--> 523             output_padding, self.groups)
    524
    525

/home/mlcmdeep/anaconda3/lib/python3.6/site-packages/torch/nn/functional.py in conv_transpose2d(input, weight, bias, stride, padding, output_padding, groups)
    116     f = ConvNd(_pair(stride), _pair(padding), _pair(1), True,
    117                _pair(output_padding), groups)
--> 118     return f(input, weight, bias) if bias is not None else f(input, weight)
    119
    120

/home/mlcmdeep/anaconda3/lib/python3.6/site-packages/torch/nn/_functions/conv.py in forward(self, input, weight, bias)
     33         if k == 3:
     34             input, weight = _view4d(input, weight)
---> 35         output = self._update_output(input, weight, bias)
     36         if k == 3:
     37             output, = _view3d(output)

/home/mlcmdeep/anaconda3/lib/python3.6/site-packages/torch/nn/_functions/conv.py in _update_output(self, input, weight, bias)
     82         self.use_cudnn = not self.is_dilated()
     83         if self.use_cudnn:
---> 84             output = input.new(*self._output_size(input, weight))
     85             if self.transposed:
     86                 self._cudnn_info = (

RuntimeError: cuda runtime error (2) : out of memory at /data/users/soumith/miniconda2/conda-bld/pytorch-0.1.9_1487346124464/work/torch/lib/THC/generic/THCStorage.cu:66

I would like to know how I can solve this problem. What is its source?

My Code:

import torch
import torch.nn as nn
import torchvision
import torchvision.datasets as dset
import torchvision.transforms as trans
from torch.autograd import Variable as V
import torchvision.utils as vutils
import torch.optim as optim

import PyQt5
import matplotlib
matplotlib.use('Qt5Agg')
import matplotlib.pyplot as plt
import numpy as np

def imshow(img):
    img = img/2 + 0.5
    npimg = img.numpy()
    plt.figure(1)
    plt.imshow(np.transpose(npimg, (1,2,0)))
    plt.show()

ngpu = 2                 # number of GPUs
nz = 100                 # size of the latent code
ngf = 64                 # number of generator feature maps
ndf = 64                 # number of discriminator feature maps
nc = 3                   # number of channels per image
batchSize = 64
imageWidth = 256
imageHeight = 256
cuda = 1
learning_rate = 0.0002
beta1 = 0.5
beta2 = 0.99
niter = 25
outf = './data'

data_root = './Genuine/'

dataset = dset.ImageFolder(root=data_root,
                           transform=trans.Compose([trans.Scale(256),
                                                    trans.CenterCrop(256),
                                                    trans.ToTensor(),
                                                    trans.Normalize((.5, .5, .5), (.5, .5, .5))]))
assert dataset

dataLoader = torch.utils.data.DataLoader(dataset=dataset,
                                         batch_size=batchSize,
                                         shuffle=True,
                                         num_workers=2)

class _netG(nn.Module):
    def __init__(self, ngpu):
        super(_netG, self).__init__()
        self.ngpu = ngpu
        self.main = nn.Sequential(
            nn.ConvTranspose2d(nz, ngf*32, 4, 1, 0, bias=True),
            nn.BatchNorm2d(32*ngf),
            nn.ReLU(True),   # size: (32*ngf) x 4 x 4
            nn.ConvTranspose2d(ngf*32, ngf*20, 4, 2, 1, bias=True),
            nn.BatchNorm2d(20*ngf),
            nn.ReLU(True),   # size: (20*ngf) x 8 x 8
            nn.ConvTranspose2d(ngf*20, ngf*16, 4, 2, 1, bias=True),
            nn.BatchNorm2d(16*ngf),
            nn.ReLU(True),   # size: (16*ngf) x 16 x 16
            nn.ConvTranspose2d(ngf*16, ngf*16, 4, 2, 1, bias=True),
            nn.BatchNorm2d(16*ngf),
            nn.ReLU(True),   # size: (16*ngf) x 32 x 32
            nn.ConvTranspose2d(ngf*32, ngf*16, 4, 2, 1, bias=True),
            nn.BatchNorm2d(16*ngf),
            nn.ReLU(True),   # size: (16*ngf) x 64 x 64
            nn.ConvTranspose2d(ngf*16, ngf*8, 4, 2, 1, bias=True),
            nn.BatchNorm2d(8*ngf),
            nn.ReLU(True),   # size: (8*ngf) x 128 x 128
            nn.ConvTranspose2d(ngf*8, ngf*3, 4, 2, 1, bias=True),
            nn.BatchNorm2d(8*ngf),
            nn.ReLU(True),   # size: (3*ngf) x 256 x 256
        )

    def forward(self, input):
        gpu_ids = None
        if isinstance(input.data, torch.cuda.FloatTensor) and self.ngpu > 1:
            gpu_ids = range(self.ngpu)
        output = nn.parallel.data_parallel(self.main, input, gpu_ids)
        return output

class _netD(nn.Module):
    def __init__(self, ngpu):
        super(_netD, self).__init__()
        self.ngpu = ngpu
        self.main = nn.Sequential(
            nn.ConvTranspose2d(nz, ngf*32, 4, 1, 0, bias=True),
            nn.BatchNorm2d(32*ngf),
            nn.ReLU(True),   # size: (32*ngf) x 4 x 4
            nn.ConvTranspose2d(ngf*32, ngf*16, 4, 2, 1, bias=True),
            nn.BatchNorm2d(16*ngf),
            nn.ReLU(True),   # size: (16*ngf) x 8 x 8
            nn.ConvTranspose2d(ngf*16, ngf*12, 4, 2, 1, bias=True),
            nn.BatchNorm2d(12*ngf),
            nn.ReLU(True),   # size: (12*ngf) x 16 x 16
            nn.ConvTranspose2d(ngf*12, ngf*10, 4, 2, 1, bias=True),
            nn.BatchNorm2d(10*ngf),
            nn.ReLU(True),   # size: (10*ngf) x 32 x 32
            nn.ConvTranspose2d(ngf*10, ngf*8, 4, 2, 1, bias=True),
            nn.BatchNorm2d(8*ngf),
            nn.ReLU(True),   # size: (8*ngf) x 64 x 64
            nn.ConvTranspose2d(ngf*8, ngf*6, 4, 2, 1, bias=True),
            nn.BatchNorm2d(6*ngf),
            nn.ReLU(True),   # size: (6*ngf) x 128 x 128
            nn.ConvTranspose2d(ngf*6, 3, 4, 2, 1, bias=True),
            nn.BatchNorm2d(3),
            nn.ReLU(True),   # size: 3 x 256 x 256
        )

    def forward(self, input):
        gpu_ids = None
        if isinstance(input.data, torch.cuda.FloatTensor) and self.ngpu > 1:
            gpu_ids = range(self.ngpu)
        output = nn.parallel.data_parallel(self.main, input, gpu_ids)
        return output.view(-1, 1)

def weights_init(m):
    classname = m.__class__.__name__
    if classname.find('Conv') != -1:
        m.weight.data.normal_(0.0, 0.02)
    elif classname.find('BatchNorm') != -1:
        m.weight.data.normal_(1, 0.02)
        m.bias.data.fill_(0)

netG = _netG(ngpu)
netG.apply(weights_init)
print(netG)

netD = _netD(ngpu)
netD.apply(weights_init)
print(netD)

criterion = nn.BCELoss() # Binary Cross Entropy Between Target and Output
input = torch.FloatTensor(batchSize, nc, imageWidth, imageHeight)
noise = torch.FloatTensor(batchSize, nz, 1, 1)
fixed_noise = torch.FloatTensor(batchSize, nz, 1, 1).normal_(0,1)
label = torch.FloatTensor(batchSize)
real_label, fake_label = 1, 0

if cuda:
    netG.cuda()
    netD.cuda()
    criterion.cuda()
    input, label = input.cuda(), label.cuda()
    noise, fixed_noise = noise.cuda(), fixed_noise.cuda()

input = V(input)
label = V(label)
noise = V(noise)
fixed_noise = V(fixed_noise)

optimizerD = optim.Adam(netD.parameters(), lr=learning_rate, betas=(beta1, beta2))
optimizerG = optim.Adam(netG.parameters(), lr=learning_rate, betas=(beta1, beta2))

for epoch in range(niter):
    for i, data in enumerate(dataLoader, 0):
        # train the discriminator on a real batch
        netD.zero_grad()
        real_cpu, _ = data
        batchSize = real_cpu.size(0)
        input.data.resize_(real_cpu.size()).copy_(real_cpu)
        label.data.resize_(batchSize).fill_(real_label)
        output = netD(input)
        errD_real = criterion(output, label)
        errD_real.backward()
        D_x = output.data.mean()

        # train the discriminator on a fake batch
        noise.data.resize_(batchSize, nz, 1, 1)
        noise.data.normal_(0, 1)
        fake = netG(noise).detach()   # detach() blocks the gradient
        label.data.fill_(fake_label)
        output = netD(fake)
        errD_fake = criterion(output, label)
        errD_fake.backward()
        D_G_z1 = output.data.mean()
        errD = errD_fake + errD_real
        optimizerD.step()

        # train the generator
        netG.zero_grad()
        label.data.fill_(real_label)
        noise.data.normal_(0, 1)
        fake = netG(noise)
        output = netD(fake)
        errG = criterion(output, label)
        errG.backward()
        D_G_z2 = output.data.mean()
        optimizerG.step()

        print('[%d/%d][%d/%d] Loss-D: %.4f Loss-G: %.4f D(x): %.4f D(G(z)): %.4f / %.4f'
              % (epoch, niter, i, len(dataLoader),
                 errD.data[0], errG.data[0], D_x, D_G_z1, D_G_z2))

        if i % 100 == 0:
            vutils.save_image(real_cpu, '%s/real_sample.png' % outf)
            fake = netG(fixed_noise)
            vutils.save_image(fake.data, '%s/fake_samples_%0.3d.png' % (outf, epoch))

    # do checkpointing
    torch.save(netG.state_dict(), '%s/netG_epoch_%d.pth' % (outf, epoch))
    torch.save(netD.state_dict(), '%s/netD_epoch_%d.pth' % (outf, epoch))

The problem is exactly what the error says: you ran out of memory on the GPU. You can try reducing the batch size or the image size.
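
For example, with the settings at the top of your code (a minimal sketch; the exact values are guesses you would tune to fit an 8 GB card):

batchSize = 32      # halving the batch roughly halves activation memory
imageWidth = 128    # halving the resolution cuts per-layer activation
imageHeight = 128   # memory roughly 4x, since it scales with image area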


Try nvidia-smi. It will show you the memory usage.
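
If you want to watch it from inside your script, here is a sketch that shells out to nvidia-smi (assuming the binary is on your PATH):

import subprocess

# Query per-GPU memory usage via nvidia-smi (values reported in MiB).
out = subprocess.check_output(
    ['nvidia-smi', '--query-gpu=memory.used,memory.total',
     '--format=csv,noheader,nounits'])
for gpu_id, line in enumerate(out.decode().strip().split('\n')):
    used, total = map(int, line.split(', '))
    print('GPU %d: %d MiB / %d MiB used' % (gpu_id, used, total))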

To complement apaszke's answer: if you are using Jupyter and you run the cell which creates the network multiple times, always put a del net in that cell; otherwise you may be creating multiple neural networks, which will cause similar problems.
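
Something like this at the top of that cell (a sketch using the variable names from the code above):

# Drop references to the old networks before re-creating them, so
# re-running the cell does not keep stale copies alive on the GPU.
try:
    del netD, netG
except NameError:
    pass  # first run of the cell: nothing to delete yet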

If not, just try to find the maximum batch size that works on your hardware, as in the sketch below.
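
A crude way to search for that maximum (run_one_batch is a hypothetical helper you would write around one forward/backward step of the training loop above):

# Halve the batch size until a single training step fits in GPU memory.
bs = 64
while bs >= 1:
    try:
        run_one_batch(bs)       # hypothetical: one discriminator/generator step
        print('batch size %d fits' % bs)
        break
    except RuntimeError:        # CUDA out-of-memory surfaces as a RuntimeError
        print('batch size %d is too large' % bs)
        bs //= 2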


I analysed the memory consumption using nvidia-smi and found that the code steadily consumes more memory with each iteration. I am using the DCGAN code adapted from the PyTorch examples.


I faced the same problem, and I found that it comes from the upsampling layers.

When I use nvidia-smi to monitor GPU memory usage, I see that at some point the memory required by the ConvTranspose2d upsampling doubles, and it reports out of memory.

Maybe consider reducing the batch size.
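
You can also estimate the failing allocation on paper. A back-of-the-envelope sketch for the first ConvTranspose2d in netD above (my own arithmetic, assuming float32 activations and the two-way batch split that data_parallel performs):

# ConvTranspose2d output size: (H_in - 1)*stride - 2*padding + kernel
h_out = (256 - 1) * 1 - 2 * 0 + 4            # = 259
per_gpu_batch = 64 // 2                      # data_parallel splits the batch across 2 GPUs
channels = 64 * 32                           # ngf*32 output channels
bytes_needed = per_gpu_batch * channels * h_out * h_out * 4
print('%.1f GB' % (bytes_needed / 1024**3))  # ~16.4 GB, far beyond a GTX 1080's 8 GB

That single output tensor already cannot fit, which matches the traceback failing at output = input.new(*self._output_size(input, weight)).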
