Hello,
I usually calculate training and validation loss in the following way, but I am not sure whether I am logging the validation and training losses correctly.
def train(model, train_loader, valid_loader, criterion, optimizer, scheduler,
          n_epochs, batch_size, show_every_n_batch, device):
    """Train `model` for `n_epochs`, logging averaged losses periodically.

    Every `show_every_n_batch` training batches, the training loss averaged
    over that interval and the validation loss averaged over the validation
    set are printed and recorded.

    Returns:
        (model, train_e_losses, val_e_losses) — the trained model plus the
        recorded loss averages. NOTE: despite the `_e_` (epoch) naming kept
        from the original, these lists hold one entry per *logging interval*,
        not one per epoch.
    """
    data_len = len(train_loader.dataset)
    # Number of *full* batches; the trailing incomplete batch (if any) is
    # skipped so every logged average covers identically sized batches.
    n_batches = data_len // batch_size
    # BUG FIX: the original used len(valid_loader)//batch_size, but
    # len(DataLoader) is already the batch count, not the sample count,
    # which made the validation loop break almost immediately.
    n_batches_v = len(valid_loader.dataset) // batch_size

    batch_losses = []     # per-batch training losses since the last log
    train_e_losses = []   # averaged training loss, one per logging interval
    val_e_losses = []     # averaged validation loss, one per logging interval

    model.train()
    for e in range(n_epochs):
        t0 = time.time()
        for batch_idx, (specs, labels) in enumerate(train_loader, 1):
            # Skip the final, smaller batch to avoid size-mismatch errors.
            if batch_idx > n_batches:
                break
            specs, labels = specs.to(device), labels.to(device)

            optimizer.zero_grad()      # clear accumulated gradients
            output = model(specs)
            loss = criterion(output, labels.float())
            loss.backward()
            optimizer.step()
            scheduler.step()           # per-batch LR schedule step
            batch_losses.append(loss.item())

            if batch_idx % show_every_n_batch == 0:
                # --- validation pass over full validation batches ---
                val_losses = []
                model.eval()
                with torch.no_grad():  # no gradients needed during eval
                    for batch_idx_v, (v_specs, v_labels) in enumerate(valid_loader, 1):
                        if batch_idx_v > n_batches_v:
                            break
                        v_specs = v_specs.to(device)
                        v_labels = v_labels.to(device)
                        # BUG FIX: the original called model(specs, val_h)
                        # with an undefined `val_h` (leftover RNN hidden-state
                        # code); call the model exactly as in training.
                        v_output = model(v_specs)
                        val_losses.append(criterion(v_output, v_labels.float()).item())
                model.train()

                print('Train Epoch: {} [{}/{} ({:.0f}%)]\tTraining Loss: {:.6f}\tValidation Loss: {:.6f}'.format(
                    e + 1, batch_idx * len(specs), data_len,
                    100. * batch_idx / len(train_loader),
                    np.average(batch_losses), np.average(val_losses)))
                val_e_losses.append(np.average(val_losses))
                train_e_losses.append(np.average(batch_losses))
                batch_losses = []      # reset accumulator for next interval

        print('Epoch {} took {} seconds'.format(e + 1, time.time() - t0))
    return (model, train_e_losses, val_e_losses)