CUDA out of memory error

I am training a CycleGAN on the CelebA dataset.

def train(G, F, Dg, Df, epochs=10, batch_size=32,
          D_real_loss=None, D_fake_loss=None, G_loss=None, F_loss=None):
    # avoid mutable default arguments: create fresh lists per call
    D_real_loss = [] if D_real_loss is None else D_real_loss
    D_fake_loss = [] if D_fake_loss is None else D_fake_loss
    G_loss = [] if G_loss is None else G_loss
    F_loss = [] if F_loss is None else F_loss
    torch.cuda.empty_cache()
    for epoch in range(epochs):
        print('Epoch number: {0}'.format(epoch))
        for batch, (rude_batch, smile_batch) in enumerate(zip(rude_loader, smile_loader)):
            # Variable is deprecated since PyTorch 0.4; plain tensors carry autograd state
            rude_real, smile_real = rude_batch[0].type(dtype), smile_batch[0].type(dtype)
            del rude_batch, smile_batch
            torch.cuda.empty_cache()
            # first half: update Dg and G
            smile_fake = G(rude_real)
            
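            # score real and fake images with both discriminators (Dgnp is defined outside this function) and blend the scores 80/20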
            scores_real, scores_real_np = pass_through_discriminator(Dg, smile_real), Dgnp(smile_real)
            scores_fake, scores_fake_np = pass_through_discriminator(Dg, smile_fake), Dgnp(smile_fake)
            scores_real, scores_fake = 0.8 * scores_real + 0.2 * scores_real_np, 0.8 * scores_fake + 0.2 * scores_fake_np

            loss1, loss2 = torch.mean((scores_real - label_real)**2), torch.mean((scores_fake - label_fake)**2)
            Dg_optim.zero_grad()
            loss_dg = loss1 + loss2  
            del loss1, loss2
            torch.cuda.empty_cache()
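            # retain_graph=True keeps the whole graph, including G's activations, alive,
            # because scores_fake is reused in loss_g below (the second half does the same)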
            loss_dg.backward(retain_graph=True)
            Dg_optim.step()

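            # generator loss: LSGAN term (make Dg score fakes as real) + 10x L1 cycle-consistency term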
            loss_g = torch.mean((scores_fake - label_real)**2) + 10 * torch.mean(torch.abs(G(F(smile_real)) - smile_real))
            G_optim.zero_grad()
            loss_g.backward()  # this graph is not reused afterwards, so retain_graph is unnecessary
            G_optim.step()
            
            # second half: update Df and F
            rude_fake = F(smile_real)

            scores_real, scores_real_np = pass_through_discriminator(Df, rude_real), Dfnp(rude_real)
            scores_fake, scores_fake_np = pass_through_discriminator(Df, rude_fake),  Dfnp(rude_fake)
            scores_real, scores_fake = 0.8 * scores_real + 0.2 * scores_real_np, 0.8 * scores_fake + 0.2 * scores_fake_np
           
            loss1, loss2 = torch.mean((scores_real - label_real)**2), torch.mean((scores_fake - label_fake)**2)
            Df_optim.zero_grad()
            loss_df = loss1 + loss2
            del loss1, loss2
            torch.cuda.empty_cache()
            loss_df.backward(retain_graph=True)
            Df_optim.step()

            loss_f = torch.mean((scores_fake - label_real)**2) + 10 * torch.mean(torch.abs(F(G(rude_real)) - rude_real))
            F_optim.zero_grad()
            loss_f.backward()
            F_optim.step()
            
            del smile_fake, smile_real, rude_fake  # keep rude_real for test_image after training
            torch.cuda.empty_cache()
            
            if batch % 100 == 0:
                print('**Batch number: {0}**'.format(batch))
                print('Discriminator G loss: {0}'.format(loss_dg.item()))
                print('Generator G loss: {0}'.format(loss_g.item()))
                print('Discriminator F loss: {0}'.format(loss_df.item()))
                print('Generator F loss: {0}'.format(loss_f.item()))
            # record the scalars first, then free the graph-holding loss tensors
            dg_val, g_val, df_val, f_val = loss_dg.item(), loss_g.item(), loss_df.item(), loss_f.item()
            del loss_dg, loss_g, loss_df, loss_f
    
        G_scheduler.step()
        F_scheduler.step()
        Dg_scheduler.step()
        Df_scheduler.step()
        
        # per-epoch loss history: both discriminators and both generators
        D_real_loss += [df_val]
        D_fake_loss += [dg_val]
        F_loss += [f_val]
        G_loss += [g_val]

    test_image(rude_real, G, F)
    saver(G.state_dict(), F.state_dict(), Dg.state_dict(), Df.state_dict())
    G, F, Dg, Df = loader(G, F, Dg, Df)
    plotter(D_real_loss, D_fake_loss, G_loss, F_loss)

But I get the following error when I run the code:

RuntimeError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 11.00 GiB total capacity; 8.37 GiB already allocated; 6.86 MiB free; 8.42 GiB reserved in total by PyTorch)

I did delete the variables I no longer use, and I call torch.cuda.empty_cache().
Any suggestions on how I can free memory would be really helpful. Thanks in advance!

I reduced the batch_size to 8 and it's working :slight_smile:
But is there any other way to overcome this error?


Unfortunately, 11 GB is not a lot of GPU memory for large neural networks like a CycleGAN.
Reducing the batch size is usually the right way to deal with this.
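
If you want the effect of a larger batch while only keeping 8 images in memory at a time, gradient accumulation is a common workaround. Here is a minimal sketch, not your exact loop: model, optimizer, criterion, and loader are placeholder names, and accum_steps = 4 simulates an effective batch of 32 out of micro-batches of 8.

accum_steps = 4  # 4 micro-batches of 8 images ~ one effective batch of 32

optimizer.zero_grad()
for i, (x, y) in enumerate(loader):
    x, y = x.cuda(), y.cuda()
    # scale the loss so the accumulated gradient matches one big batch
    loss = criterion(model(x), y) / accum_steps
    loss.backward()  # gradients accumulate in param.grad until zero_grad()
    if (i + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

Only one micro-batch of activations is alive at a time, so peak memory stays close to your batch-8 run; the same idea carries over to each of the four optimizer updates in your loop.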

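One more thing worth checking in the posted loop: retain_graph=True keeps the whole generator graph alive across backward calls, which is itself a large memory cost. The usual GAN pattern is to detach the fake image for the discriminator update and run a fresh discriminator forward pass for the generator update. A sketch of the first half only, reusing the names from your code but calling Dg directly (the pass_through_discriminator wrapper and the Dgnp blending are left out for brevity):

smile_fake = G(rude_real)

# discriminator update: detach() blocks gradients from flowing into G,
# so this backward() neither needs nor keeps the generator graph
loss_dg = torch.mean((Dg(smile_real) - label_real) ** 2) \
        + torch.mean((Dg(smile_fake.detach()) - label_fake) ** 2)
Dg_optim.zero_grad()
loss_dg.backward()
Dg_optim.step()

# generator update: fresh forward pass through the updated Dg, no retain_graph needed
loss_g = torch.mean((Dg(smile_fake) - label_real) ** 2) \
       + 10 * torch.mean(torch.abs(G(F(smile_real)) - smile_real))
G_optim.zero_grad()
loss_g.backward()
G_optim.step()

With each graph freed right after its backward(), most of the manual del / torch.cuda.empty_cache() calls become unnecessary.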