RuntimeError: CUDA out of memory in the second epoch

I am sharing a piece of my code where I am implementing SimCLR on a 16GB GPU. I am using a batch size of 64.

writer = SummaryWriter()
nr = 0
global_step = 0
current_epoch = 0
epochs = 30

def train(train_loader, model, criterion, optimizer, writer):
    loss_epoch = 0
    global global_step

    for step, (x_i, x_j) in enumerate(train_loader):
        optimizer.zero_grad()
        x_i = x_i.to('cuda:0')
        x_j = x_j.to('cuda:0')

        # positive pair, with encoding
        z_i = model(x_i)
        z_j = model(x_j)

        loss = criterion(z_i, z_j)
        loss.backward()

        optimizer.step()
    
        if nr == 0 and step % 10 == 0:
            print(f"Step [{step}/{len(train_loader)}]\t Loss: {round(loss.item(), 5)}")

        if nr == 0:
            writer.add_scalar("Loss/train_epoch", loss.item(), global_step)
            global_step += 1

        loss_epoch += loss.item()
    return loss_epoch

def valid(valid_loader, model, criterion, writer):
    loss_epoch = 0
    for step, (x_i, x_j) in enumerate(valid_loader):
    
        x_i = x_i.to('cuda:0')
        x_j = x_j.to('cuda:0')

        # positive pair, with encoding
        z_i = model(x_i)
        z_j = model(x_j)

        loss = criterion(z_i, z_j)
    
        if nr == 0 and step % 10 == 0:
            print(f"Step [{step}/{len(valid_loader)}]\t Loss: {round(loss.item(),5)}")

        loss_epoch += loss.item()
    return loss_epoch

for epoch in range(epochs):
    
    print(f"Epoch [{epoch}/{epochs}]\t")
    stime = time.time()
    
    lr = optimizer.param_groups[0]["lr"]
    
    model.train()
    tr_loss_epoch = train(dl, model, criterion, optimizer, writer)

    if nr == 0 and scheduler:
        scheduler.step()

    if nr == 0 and epoch % 10 == 0:
        save_model(model, optimizer, current_epoch)
    
    model.eval()
    val_loss_epoch = valid(vdl, model, criterion, writer)
    
    if nr == 0:
        writer.add_scalar("Loss/train", tr_loss_epoch / len(dl), epoch)
        writer.add_scalar("Loss/valid", val_loss_epoch / len(vdl), epoch)
        writer.add_scalar("Misc/learning_rate", lr, epoch)
        print(
            f"Epoch [{epoch}/{epochs}]\t Training Loss: {tr_loss_epoch / len(dl)}\t lr: {round(lr, 5)}"
        )
        print(
            f"Epoch [{epoch}/{epochs}]\t Validation Loss: {val_loss_epoch / len(vdl)}\t lr: {round(lr, 5)}"
        )
        current_epoch += 1
        
    dg.on_epoch_end()
    
    time_taken = (time.time()-stime)/60
    print(f"Epoch [{epoch}/{epochs}]\t Time Taken: {time_taken} minutes")

## end training
save_model(model, optimizer, current_epoch)

The issue I am facing with this code is that the first epoch runs fine, but as soon as the code proceeds into the second epoch I get this error:

RuntimeError: CUDA out of memory. Tried to allocate 50.00 MiB (GPU 0; 15.90 GiB total capacity; 14.73 GiB already allocated; 27.06 MiB free; 14.85 GiB reserved in total by PyTorch)

Why am I getting this error in the second epoch if the first epoch is running without any error?

Can you try wrapping the validation call in torch.no_grad(), like this?

model.eval()
with torch.no_grad():
    val_loss_epoch = valid(vdl, model, criterion, writer)
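
Equivalently, you could decorate the whole function with torch.no_grad() so that every call skips graph construction. A minimal sketch along the lines of your valid function (same loader/model/criterion as above, only illustrative):

import torch

@torch.no_grad()  # autograd is disabled for every call to this function
def valid(valid_loader, model, criterion, writer):
    loss_epoch = 0
    for step, (x_i, x_j) in enumerate(valid_loader):
        x_i = x_i.to('cuda:0')
        x_j = x_j.to('cuda:0')
        z_i = model(x_i)   # no computation graph is built here
        z_j = model(x_j)
        loss = criterion(z_i, z_j)
        loss_epoch += loss.item()
    return loss_epoch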

Thanks! It worked!!

I was not aware that model.eval() does not turn off gradient computation! It only acts as a switch for layers like batch norm and dropout, putting them into evaluation mode.

Using torch.no_grad() is what actually prevents gradient computation (and the building of the computation graph) in the validation step.
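
A quick way to see the difference (a minimal toy sketch, not from my actual training code):

import torch
import torch.nn as nn

model = nn.Linear(4, 2)
x = torch.randn(1, 4)

model.eval()                       # only switches layer behaviour (BN/Dropout)
print(model(x).requires_grad)      # True: a graph is still being recorded

with torch.no_grad():              # actually disables gradient tracking
    print(model(x).requires_grad)  # False: no graph, so no extra memory is held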