RuntimeError: CUDA out of memory error

Hello,

I’m training a VAE on a dataset of satellite images (256x256). Here is some of my code:

class VAE(nn.Module):
    """Convolutional VAE: two conv layers -> FC bottleneck (latent dim 20)
    -> two transposed convs back to the input resolution.

    Args:
        inp_s: shape of an input batch (N, C, H, W); only inp_s[2] (H) is used,
            so inputs are assumed square — TODO confirm H == W for this data.
        conv_kernel_size1: kernel size shared by all conv / deconv layers.
    """
    def __init__(self, inp_s, conv_kernel_size1):
        super(VAE, self).__init__()
        self.n1 = 1    # input channels (a single band is sliced out upstream)
        self.n2 = 20   # channels after conv1
        self.n3 = 50   # channels after conv2
        self.padding = 0
        self.stride = 1
        # Conv output size: (H - K + 2*P) / S + 1.  The original subtracted
        # 2*padding; with padding=0 the value is unchanged, but the sign is
        # now correct if padding is ever made non-zero.
        self.size_out_conv1 = int((inp_s[2] - conv_kernel_size1 + 2 * self.padding) / self.stride + 1)
        self.size_out_conv2 = int((self.size_out_conv1 - conv_kernel_size1 + 2 * self.padding) / self.stride + 1)
        self.fc1 = nn.Linear(self.size_out_conv2 * self.size_out_conv2 * self.n3, 700)
        self.fc21 = nn.Linear(700, 20)   # mu head
        self.fc22 = nn.Linear(700, 20)   # logvar head
        self.fc3 = nn.Linear(20, 700)
        self.fc4 = nn.Linear(700, self.size_out_conv2 * self.size_out_conv2 * self.n3)
        self.conv1 = nn.Conv2d(self.n1, self.n2, kernel_size=conv_kernel_size1)
        self.conv2 = nn.Conv2d(self.n2, self.n3, kernel_size=conv_kernel_size1)
        self.deconv1 = nn.ConvTranspose2d(self.n2, self.n1, kernel_size=conv_kernel_size1)
        self.deconv2 = nn.ConvTranspose2d(self.n3, self.n2, kernel_size=conv_kernel_size1)

    def encode(self, x):
        """Return (mu, logvar, s) where s is the conv feature-map shape
        needed by decode() to un-flatten."""
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        s = x.shape
        x = x.view(-1, s[1] * s[2] * s[3])
        # NOTE: the original computed self.fc1(x) twice; the first result was
        # immediately overwritten — wasted compute and activation memory,
        # which matters when the flattened feature vector is large.
        h1 = F.dropout(F.relu(self.fc1(x)), training=self.training)
        return self.fc21(h1), self.fc22(h1), s

    def reparameterize(self, mu, logvar):
        """Sample z = mu + eps * std with eps ~ N(0, I) (reparameterization trick)."""
        std = torch.exp(0.5 * logvar)
        eps = torch.randn_like(std)
        return mu + eps * std

    def decode(self, z, s):
        """Reconstruct an image batch from latent z; s is the shape saved by encode()."""
        h3 = F.relu(self.fc3(z))
        x = F.relu(self.fc4(h3))
        x = x.view(s[0], s[1], s[2], s[3])
        x = F.relu(self.deconv2(x))
        x = self.deconv1(x)
        return torch.sigmoid(x)  # outputs in [0, 1] for BCE reconstruction loss

    def forward(self, x):
        mu, logvar, s = self.encode(x)
        z = self.reparameterize(mu, logvar)
        return self.decode(z, s), mu, logvar

def loss_function(recon_x, x, mu, logvar):
    """Return (BCE + KLD, BCE, KLD, MSE).

    BCE + KLD is the quantity actually optimized; MSE is computed only for
    monitoring. All terms use 'sum' reduction over the whole batch.
    """
    mse = F.mse_loss(recon_x, x, reduction="sum")
    bce = F.binary_cross_entropy(recon_x, x, reduction="sum")
    # KL divergence between N(mu, var) and N(0, I), summed over batch & dims.
    kld = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
    total = bce + kld
    return total, bce, kld, mse

def train(train_dl, model, epoch, inp_im, out_im, lr1, optimizer=None):
    """Run one training epoch and return per-sample (total, BCE, KLD, MSE) losses.

    Args:
        train_dl: DataLoader yielding (images, labels); only channel 0 of the
            images is used.
        model: the VAE to train.
        epoch: current epoch index (unused here; kept for caller compatibility).
        inp_im, out_im: accepted for caller compatibility but unused here.
        lr1: learning rate for the Adam optimizer created when none is passed.
        optimizer: pass a persistent optimizer to keep Adam's moment estimates
            across epochs; the original re-created Adam every epoch, silently
            resetting its state.
    """
    torch.backends.cudnn.benchmark = True
    if optimizer is None:
        optimizer = torch.optim.Adam(model.parameters(), lr=lr1)
    # Derive the device from the model instead of relying on a global `device`.
    dev = next(model.parameters()).device
    model.train()
    train_loss = 0.0
    bce_loss = 0.0
    kld_loss = 0.0
    mse_loss = 0.0
    # `Variable` wrapping removed: it is deprecated since PyTorch 0.4 and a no-op.
    for idx, (data, _) in enumerate(train_dl):
        # Keep only the first channel and restore the channel dim -> (N, 1, H, W).
        data1 = data[:, 0, :, :].unsqueeze(1).to(dev)
        out, mu, logvar = model(data1)
        loss, bce, kld, mse = loss_function(out, data1, mu, logvar)
        train_loss += loss.item()
        bce_loss += bce.item()
        kld_loss += kld.item()
        mse_loss += mse.item()
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    n = len(train_dl.dataset)
    return train_loss / n, bce_loss / n, kld_loss / n, mse_loss / n

def test(epoch, model, loader=None):
    """Evaluate the model and return per-sample (total loss, MSE).

    Args:
        epoch: current epoch index (unused here; kept for caller compatibility).
        model: the VAE to evaluate.
        loader: optional DataLoader; falls back to the module-level
            `test_loader` global for backward compatibility.
    """
    if loader is None:
        loader = test_loader  # module-level loader defined in __main__
    # Derive the device from the model instead of relying on a global `device`.
    dev = next(model.parameters()).device
    model.eval()
    test_loss = 0.0
    test_mse_loss = 0.0
    with torch.no_grad():
        for i, (data, _) in enumerate(loader):
            # Move only the single-channel slice to the device. The original
            # first moved the full multi-channel batch, then moved the slice
            # again — doubling the transfer and GPU memory use.
            data1 = data[:, 0, :, :].unsqueeze(1).to(dev)
            recon_batch, mu, logvar = model(data1)
            loss, bce, kld, mse = loss_function(recon_batch, data1, mu, logvar)
            test_loss += loss.item()
            test_mse_loss += mse.item()
    n = len(loader.dataset)
    return test_loss / n, test_mse_loss / n

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='VAE MNIST Example')
    parser.add_argument('--batch-size', type=int, default=128, metavar='N',
                        help='input batch size for training (default: 128)')
    parser.add_argument('-ks', '--kernel-size', type=int, default=5, metavar='N',
                        help='kernel size (convolution)')
    parser.add_argument('--epochs', type=int, default=200, metavar='N',
                        help='number of epochs to train (default: 200)')
    parser.add_argument('-lr', '--learning-rate', type=float, default=1e-3, metavar='',
                        help='learning rate (default: 1e-3)')
    # BUG FIX: args.input_img / args.output_img are used in the train() call
    # below but were never declared, which raises AttributeError at runtime.
    parser.add_argument('--input-img', default=None,
                        help='input image path passed through to train() (unused there)')
    parser.add_argument('--output-img', default=None,
                        help='output image path passed through to train() (unused there)')
    # Hard-coded dataset root made configurable; default preserves old behavior.
    parser.add_argument('--data-dir', default='/home/zaianir/Documents/dataset_s2',
                        help='root folder containing train/ and test/ subfolders')
    args = parser.parse_args()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # pin_memory + workers only make sense when a GPU is present.
    kwargs = {'num_workers': 1, 'pin_memory': True} if torch.cuda.is_available() else {}

    transform1 = transforms.Compose([transforms.ToTensor()])
    train1 = ImageFolder(args.data_dir + "/train", transform1)
    test1 = ImageFolder(args.data_dir + "/test", transform1)
    train_loader = torch.utils.data.DataLoader(train1, batch_size=args.batch_size,
                                               shuffle=True, **kwargs)
    test_loader = torch.utils.data.DataLoader(test1, batch_size=args.batch_size,
                                              shuffle=False, **kwargs)

    # Peek at one batch to size the network layers from the input shape.
    data, target = next(iter(train_loader))
    inp_s = data.shape
    model = VAE(inp_s, args.kernel_size).to(device)
    for epoch in range(1, args.epochs + 1):
        moy_train_loss, av_bce_loss, av_kld_loss, av_mse_loss = train(
            train_loader, model, epoch, args.input_img, args.output_img, args.learning_rate)
        test_l, mse_test_l = test(epoch, model)
            
      
  

I’m getting this error:

load conda/4.5.4 : OK
Traceback (most recent call last):
File “/home/uz/zaianir/scratch/VAE22.py”, line 194, in
moy_train_loss,av_bce_loss, av_kld_loss, av_mse_loss =train(train_loader, model, epoch, args.input_img, args.output_img, args.learning_rate)
File “/home/uz/zaianir/scratch/VAE22.py”, line 96, in train
loss.backward()
File “/home/uz/zaianir/.conda/envs/myenv2/lib/python3.7/site-packages/torch/tensor.py”, line 102, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File “/home/uz/zaianir/.conda/envs/myenv2/lib/python3.7/site-packages/torch/autograd/init.py”, line 90, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: CUDA out of memory. Tried to allocate 7.64 GiB (GPU 0; 31.72 GiB total capacity; 20.76 GiB already allocated; 5.01 GiB free; 3.61 GiB cached)

  • I’m running my validation code with torch.no_grad()
  • I added torch.backends.cudnn.benchmark = True before training
  • I decreased the batch size from 128 to 20

But I still get the same error.
Can you help me please.
Thank you.

Hi,

You can remove `data, label = Variable(data), Variable(label)`; `Variable` has been deprecated since PyTorch 0.4 and is no longer needed.
Have you tried using a batch size of 1. It looks like your model is quite large.
Also you can run with anomaly mode enabled to see which part of the backward tries to allocate this 7GB Tensor.

Hello,

Thank you for your help.
Yes I tried using a batch size of 1 but I got the same error.
I think the problem comes from the fact that a 256x256 patch is too large, so I used transforms.CenterCrop with a size of 32 and that solved the problem.

1 Like