I am sharing a piece of my code where I am implementing SimCLR on a 16GB GPU. I am using a batch size of 64.
writer = SummaryWriter()
nr = 0
global_step = 0
current_epoch = 0
epochs = 30
def train(train_loader, model, criterion, optimizer, writer):
loss_epoch = 0
global global_step
for step, (x_i, x_j) in enumerate(train_loader):
optimizer.zero_grad()
x_i = x_i.to('cuda:0')
x_j = x_j.to('cuda:0')
# positive pair, with encoding
z_i = model(x_i)
z_j = model(x_j)
loss = criterion(z_i, z_j)
loss.backward()
optimizer.step()
if nr == 0 and step % 10 == 0:
print(f"Step [{step}/{len(train_loader)}]\t Loss: {round(loss.item(), 5)}")
if nr == 0:
writer.add_scalar("Loss/train_epoch", loss.item(), global_step)
global_step += 1
loss_epoch += loss.item()
return loss_epoch
def valid(valid_loader, model, criterion, writer):
loss_epoch = 0
for step, (x_i, x_j) in enumerate(valid_loader):
x_i = x_i.to('cuda:0')
x_j = x_j.to('cuda:0')
# positive pair, with encoding
z_i = model(x_i)
z_j = model(x_j)
loss = criterion(z_i, z_j)
if nr == 0 and step % 10 == 0:
print(f"Step [{step}/{len(valid_loader)}]\t Loss: {round(loss.item(),5)}")
loss_epoch += loss.item()
return loss_epoch
for epoch in range(epochs):
print(f"Epoch [{epoch}/{epochs}]\t")
stime = time.time()
lr = optimizer.param_groups[0]["lr"]
model.train()
tr_loss_epoch = train(dl, model, criterion, optimizer, writer)
if nr == 0 and scheduler:
scheduler.step()
if nr == 0 and epoch % 10 == 0:
save_model(model, optimizer, current_epoch)
model.eval()
val_loss_epoch = valid(vdl, model, criterion, writer)
if nr == 0:
writer.add_scalar("Loss/train", tr_loss_epoch / len(dl), epoch)
writer.add_scalar("Loss/valid", val_loss_epoch / len(vdl), epoch)
writer.add_scalar("Misc/learning_rate", lr, epoch)
print(
f"Epoch [{epoch}/{epochs}]\t Training Loss: {tr_loss_epoch / len(dl)}\t lr: {round(lr, 5)}"
)
print(
f"Epoch [{epoch}/{epochs}]\t Validation Loss: {val_loss_epoch / len(vdl)}\t lr: {round(lr, 5)}"
)
current_epoch += 1
dg.on_epoch_end()
time_taken = (time.time()-stime)/60
print(f"Epoch [{epoch}/{epochs}]\t Time Taken: {time_taken} minutes")
## end training
save_model(model, optimizer, current_epoch)
The issue I am facing with this code is that the first epoch is running fine, but after the first is done and the code proceeds into the second epoch, I am getting this error
RuntimeError: CUDA out of memory. Tried to allocate 50.00 MiB (GPU 0; 15.90 GiB total capacity; 14.73 GiB already allocated; 27.06 MiB free; 14.85 GiB reserved in total by PyTorch)
Why am I getting this error in the second epoch if the first epoch is running without any error?