Hi everyone,
It is very common to see the following scheme in the examples and tutorials (this one is taken from the tutorial “How to train a classifier”):
# Tutorial-style training loop: iterate over the whole dataset twice and
# report the mean loss of each window of 2000 mini-batches.
for epoch in range(2):  # loop over the dataset multiple times
    # Accumulator for the current reporting window only; it is also cleared
    # after every report below, so it never holds a whole-training total.
    running_loss = 0.0
    for i, data in enumerate(trainloader, 0):
        # get the inputs
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # print statistics
        running_loss += loss.item()
        if i % 2000 == 1999:  # print every 2000 mini-batches
            # Dividing by 2000 turns the accumulated sum into the mean loss
            # over the window that just ended.
            print('[%d, %5d] loss: %.3f' %
                  (epoch + 1, i + 1, running_loss / 2000))
            running_loss = 0.0  # start a fresh window

print('Finished Training')
However, I don’t understand why the loss and the accuracy are restarted every epoch.
Don’t we want to see how it has been evolving during the entire training process? Is there any reason to do it?
This is my attempt, initializing all the control variables at the beginning:
def train_baseline(epochs, trainset, validset, model, criterion, optimizer,
                   log_file=None, save_frequency=1, validate=False):
    """Train ``model`` and record per-step loss/accuracy histories.

    The correct/total counters are initialised once and never reset, so every
    accuracy value is a running figure over everything seen since the start
    of the call (training and validation are tracked separately).

    Args:
        epochs: Number of passes over ``trainset``.
        trainset: Iterable yielding ``(images, labels)`` training batches.
        validset: Iterable yielding ``(images, labels)`` validation batches.
        model: Network to train. It is called on ``images``, must provide
            ``zero_grad()``, ``state_dict()``, ``eval()``/``train()`` and a
            ``name`` attribute (used in the checkpoint file name).
        criterion: Callable ``criterion(outputs, labels)`` returning the loss.
        optimizer: Optimizer whose ``step()`` is called after each backward.
        log_file: Optional path; when given, one stats line per training step
            is written to it (the file is truncated first).
        save_frequency: Save a checkpoint under ``./models`` every this many
            epochs. A falsy value disables checkpointing.
        validate: ``False`` to skip validation, or an int ``k`` to run a full
            pass over ``validset`` after every ``k``-th training step.

    Returns:
        ``(train_history, valid_history)``: two dicts with the keys ``'loss'``
        (losses rounded to 2 decimals) and ``'accuracy'`` (running accuracy
        in percent, rounded to 2 decimals), one entry per processed batch.

    Note:
        Relies on the helpers ``now()`` and ``time(start)`` for timing; they
        are not part of this block and are assumed to be defined elsewhere in
        the file, with ``time(start)`` returning ``(hours, minutes, ...)``.
    """
    print('Starting training time...')

    train_acc, valid_acc = [], []
    train_loss, valid_loss = [], []
    train_total = valid_total = 0
    train_correct = valid_correct = 0

    # Open the log only when requested; every later use is guarded on `log`.
    log = open(log_file, 'w+') if log_file else None
    start = now()
    try:
        for epoch in range(1, epochs + 1):
            # Steps are numbered from 1 and restart at every epoch.
            for step, (images, labels) in enumerate(trainset, 1):
                # Forward pass. as_tensor accepts both tensors and plain
                # sequences of labels without an extra copy-construct.
                labels = torch.as_tensor(labels)
                model.zero_grad()
                outputs = model(images)

                # Compute loss and running accuracy.
                loss = criterion(outputs, labels)
                loss_value = loss.item()
                train_loss.append(round(loss_value, 2))
                train_total += labels.size(0)
                train_correct += _count_correct(outputs, labels)
                acc = _accuracy_percent(train_correct, train_total)
                train_acc.append(acc)

                # Backpropagation.
                loss.backward()
                optimizer.step()

                # Report the statistics of this step (current accuracy only,
                # not the whole history list).
                stats = 'Epoch [{}/{}], Step [{}], Loss: {}, Accuracy: {}'.format(
                    epoch, epochs, step, loss_value, acc)
                print('\n' + stats)
                if log is not None:
                    log.write(stats + '\n')
                    log.flush()  # keep the log readable while training runs

                # `validate` is False or an int: validate every that many steps.
                if validate and step % validate == 0:
                    print('Entering in validation...')
                    valid_correct, valid_total = _run_validation(
                        model, criterion, validset,
                        valid_loss, valid_acc, valid_correct, valid_total)

            if save_frequency and epoch % save_frequency == 0:
                # torch.save does not create missing directories.
                os.makedirs('./models', exist_ok=True)
                torch.save(model.state_dict(),
                           os.path.join('./models', '%s-%d.pkl' % (model.name, epoch)))

        elapsed = time(start)
        print('Time: {} hours {} minutes'.format(elapsed[0], elapsed[1]))
    finally:
        # Close the log even if training raises.
        if log is not None:
            log.close()

    train_history = {'loss': train_loss, 'accuracy': train_acc}
    valid_history = {'loss': valid_loss, 'accuracy': valid_acc}
    return train_history, valid_history


def _count_correct(outputs, labels):
    """Return how many rows of ``outputs`` have their arg-max equal to ``labels``."""
    _, predictions = torch.max(outputs.detach(), 1)
    return int((predictions == labels).sum().item())


def _accuracy_percent(correct, total):
    """Return ``correct / total`` in percent, rounded to 2 decimals (0.0 if ``total`` is 0)."""
    if not total:
        return 0.0
    return round(100.0 * correct / total, 2)


def _run_validation(model, criterion, validset, loss_history, acc_history,
                    correct, total):
    """Run one pass over ``validset`` and return the updated ``(correct, total)``.

    Appends one rounded loss and one running accuracy per batch to
    ``loss_history`` / ``acc_history``. The model is switched to eval mode and
    gradients are disabled for the pass; the previous train/eval mode is
    restored afterwards.
    """
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for images, labels in validset:
                labels = torch.as_tensor(labels)
                outputs = model(images)
                batch_loss = criterion(outputs, labels)
                loss_history.append(round(batch_loss.item(), 2))
                total += labels.size(0)
                correct += _count_correct(outputs, labels)
                acc_history.append(_accuracy_percent(correct, total))
    finally:
        model.train(was_training)
    return correct, total
What am I missing?
Thanks in advance.
Regards,
Pablo