I’m trying to train my model, but the following error shows up:
RuntimeError: CUDA out of memory. Tried to allocate 1.14 GiB (GPU 1; 31.75 GiB total capacity; 3.73 GiB already allocated; 1.10 GiB free; 6.38 GiB reserved in total by PyTorch)
I tried using loss += float(loss.item()) as the original document suggests here, but it didn’t work; the same error still shows up.
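For reference, this is roughly how I applied that change inside the training loop (simplified from the full code below):

    # accumulate the plain Python float instead of the loss tensor,
    # so the computation graph from each batch can be freed
    running_loss += float(loss.item())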
I also tried decreasing the batch size to 5, which I would like to treat as the minimum; I don’t want to go any smaller.
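For context, the data loaders are built roughly like this (paraphrased from my setup; train_dataset and val_dataset are my own dataset objects, and the exact arguments may differ):

    from torch.utils.data import DataLoader

    data_loaders = {
        'train': DataLoader(train_dataset, batch_size=sets.batch_size, shuffle=True),  # batch_size is 5 here
        'val': DataLoader(val_dataset, batch_size=sets.batch_size, shuffle=False),
    }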
Below is my full training/validation code. Can someone suggest how to solve this issue? Many thanks!
import logging
import os
import time

import numpy as np
import torch
import torch.nn as nn

log = logging.getLogger(__name__)  # assuming a module-level logger is set up elsewhere


def train(data_loaders, model, optimizer, scheduler, total_epochs, save_interval, save_folder, sets):
    # settings; data_loaders is expected to be a dict with 'train' and 'val'
    # DataLoaders, since the loop below indexes it by phase
    batches_per_epoch = len(data_loaders['train'])
    log.info('{} epochs in total, {} batches per epoch'.format(total_epochs, batches_per_epoch))
    loss_f = nn.CrossEntropyLoss()  # ignore_index=-1
    print("Current setting is:")
    print(sets)
    print("\n\n")
    if not sets.no_cuda:
        loss_f = loss_f.cuda()

    val_losses = []
    val_acc = []
    train_loss = []
    train_acc = []
    total_step = sets.batch_size
    train_time_sp = time.time()
    for epoch in range(total_epochs):
        log.info('Start epoch {}'.format(epoch + 1))
        running_loss = 0.0
        correct = 0
        total = 0
        batch_loss = 0.0
        total_t = 0
        correct_t = 0
        for phase in ['train', 'val']:
            if phase == 'train':
                model.train()
            else:
                print('Starting Validation: ')
                # eval mode only needs to be set once per phase, not per batch
                model.eval()
            for batch_id, batch_data in enumerate(data_loaders[phase]):
                # get the data batch and move it to the GPU
                batch_id_sp = epoch * batches_per_epoch + batch_id  # global batch index
                volumes, labels = batch_data
                if not sets.no_cuda:
                    volumes = volumes.cuda()
                    labels = labels.cuda()  # labels must be on the same device as the output
                if phase == 'train':
                    optimizer.zero_grad()
                    output = model(volumes)
                    loss = loss_f(output, labels)
                    loss.backward()
                    optimizer.step()
                    avg_batch_time = (time.time() - train_time_sp) / (1 + batch_id_sp)
                    # accumulate a Python float so the graph for this batch can be freed
                    running_loss += loss.item()
                    _, pred = torch.max(output, dim=1)
                    correct += torch.sum(pred == labels).item()
                    total += output.size(0)
                    log.info('{} || Batch: {}-{} ({}), loss = {:.3f}, avg_batch_time = {:.3f}'
                             .format(phase, epoch, batch_id, batch_id_sp, loss.item(), avg_batch_time))
                else:
                    # torch.no_grad() disables autograd tracking during validation
                    with torch.no_grad():
                        output_t = model(volumes)
                        # argmax assumes one-hot labels here; the train branch above
                        # uses labels as-is, so make sure both match your label format
                        labels_t = torch.argmax(labels, dim=1)
                        val_loss = loss_f(output_t, labels_t)
                        batch_loss += val_loss.item()
                        _, pred_t = torch.max(output_t, dim=1)
                        correct_t += torch.sum(pred_t == labels_t).item()
                        total_t += labels_t.size(0)  # count validation samples
                # save a checkpoint on the last batch, every save_interval epochs
                if batch_id == batches_per_epoch - 1 and epoch % save_interval == 0:
                    model_save_path = '{}/trail1_{}.pth.tar'.format(save_folder, epoch)
                    model_save_dir = os.path.dirname(model_save_path)
                    if not os.path.exists(model_save_dir):
                        os.makedirs(model_save_dir)
                    log.info('Save checkpoints: epoch = {}, batch_id = {}'.format(epoch, batch_id))
                    torch.save({
                        'epoch': epoch,
                        'batch_id': batch_id,
                        'state_dict': model.state_dict(),
                        'optimizer': optimizer.state_dict()},
                        model_save_path)
        # end-of-epoch bookkeeping
        train_acc.append(100 * correct / total)
        train_loss.append(running_loss / total_step)
        val_acc.append(100 * correct_t / total_t)
        val_losses.append(batch_loss / total_step)
        print(f"[{epoch + 1}] Train loss: {np.mean(train_loss):.3f}, Train acc: {(100 * correct / total):.3f}")
        print(f"Validation loss: {np.mean(val_losses):.3f}, Validation acc: {(100 * correct_t / total_t):.3f}")
        # release cached GPU memory between epochs
        torch.cuda.empty_cache()
    print('Finished training')