How does one track the train AND test error during training?

I wanted to track the train and test error during training. I wrote the following code, but it doesn't quite work with the semantics I intended:

def train_and_track_stats(args, nb_epochs, trainloader, testloader, net, optimizer, criterion, error_criterion, stats_collector):
    enable_cuda = args.enable_cuda
    ##
    for epoch in range(nb_epochs):  # loop over the dataset multiple times
        running_train_loss,running_train_error = 0.0,0.0
        running_test_loss,running_test_error = 0.0,0.0
        for i, (data_train, data_test) in enumerate(zip(trainloader, testloader)):
            ''' zero the parameter gradients '''
            optimizer.zero_grad()
            ''' train step = forward + backward + optimize '''
            inputs, labels = extract_data(enable_cuda,data_train,wrap_in_variable=True)
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_train_loss += loss.data[0]
            running_train_error += error_criterion(outputs,labels)
            ''' test evaluation '''
            inputs, labels = extract_data(enable_cuda,data=data_test,wrap_in_variable=True)
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            running_test_loss += loss.data[0]
            running_test_error += error_criterion(outputs,labels)
            ''' print error first iteration'''
            if i == 0: # print on the first iteration
                print(f'--\ni={i}, running_train_loss={running_train_loss}, running_train_error={running_train_error}, running_test_loss={running_test_loss},running_test_error={running_test_error}')
        ''' End of Epoch: collect stats'''
        train_loss_epoch, train_error_epoch = running_train_loss/(i+1), running_train_error/(i+1)
        test_loss_epoch, test_error_epoch = running_test_loss/(i+1), running_test_error/(i+1)
        print(f'epoch={epoch}, train_loss_epoch={train_loss_epoch}, train_error_epoch={train_error_epoch}, test_loss_epoch={test_loss_epoch},test_error_epoch={test_error_epoch}')
        stats_collector.append_losses_errors(train_loss_epoch, train_error_epoch, test_loss_epoch, test_error_epoch)
    return train_loss_epoch, train_error_epoch, test_loss_epoch, test_error_epoch

def extract_data(enable_cuda,data,wrap_in_variable=False):
    inputs, labels = data
    if enable_cuda:
        inputs, labels = inputs.cuda(), labels.cuda()
    if wrap_in_variable:
        inputs, labels = Variable(inputs), Variable(labels)
    return inputs, labels

What is the standard way this is done in PyTorch? Does it make a difference whether we are using CIFAR-10 or ImageNet?
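
For completeness, stats_collector is just a small bookkeeping object I pass around; a minimal sketch of the bit used here (the real one may do more):

class StatsCollector:
    ''' Sketch: records the per-epoch losses/errors in lists. '''
    def __init__(self):
        self.train_losses, self.train_errors = [], []
        self.test_losses, self.test_errors = [], []

    def append_losses_errors(self, train_loss, train_error, test_loss, test_error):
        self.train_losses.append(train_loss)
        self.train_errors.append(train_error)
        self.test_losses.append(test_loss)
        self.test_errors.append(test_error)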

Seems fine to me. What do you think is wrong with it?

I just noticed that zip only iterates up to the length of the shorter data loader, so it's not training on the whole training set.
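
A quick sanity check of that zip behaviour:

train_batches = range(10)  # stand-in for trainloader
test_batches = range(4)    # stand-in for a shorter testloader
print(len(list(zip(train_batches, test_batches))))  # prints 4, not 10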

I think if I want to track test error, it's best to just compute it at the end of each epoch.

I think this should work instead:

def train_and_track_stats2(args, nb_epochs, trainloader, testloader, net, optimizer, criterion, error_criterion, stats_collector):
    enable_cuda = args.enable_cuda
    ##
    print('about to start training')
    for epoch in range(nb_epochs):  # loop over the dataset multiple times
        running_train_loss,running_train_error = 0.0,0.0
        for i,data_train in enumerate(trainloader):
            ''' zero the parameter gradients '''
            optimizer.zero_grad()
            ''' train step = forward + backward + optimize '''
            inputs, labels = extract_data(enable_cuda,data_train,wrap_in_variable=True)
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            running_train_loss += loss.data[0]
            running_train_error += error_criterion(outputs,labels)
            # ''' print error first iteration'''
            # if i == 0: # print on the first iteration
            #     print(f'--\ni={i}, running_train_loss={running_train_loss}, running_train_error={running_train_error}')
        ''' End of Epoch: collect stats'''
        train_loss_epoch, train_error_epoch = running_train_loss/(i+1), running_train_error/(i+1)
        test_loss_epoch, test_error_epoch = evaluate_mdl_data_set(criterion, error_criterion, net, testloader, enable_cuda)
        stats_collector.collect_mdl_params_stats(net)
        stats_collector.append_losses_errors(train_loss_epoch, train_error_epoch, test_loss_epoch, test_error_epoch)
        print(f'epoch={epoch}, train_loss_epoch={train_loss_epoch}, train_error_epoch={train_error_epoch}, test_loss_epoch={test_loss_epoch},test_error_epoch={test_error_epoch}')
    return train_loss_epoch, train_error_epoch, test_loss_epoch, test_error_epoch
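
(evaluate_mdl_data_set isn't shown above; the idea is a single pass over the test loader that averages loss and error, something like this sketch:)

def evaluate_mdl_data_set(criterion, error_criterion, net, dataloader, enable_cuda):
    ''' Sketch: average the loss and error of net over one pass of dataloader. '''
    running_loss, running_error = 0.0, 0.0
    for i, data in enumerate(dataloader):
        inputs, labels = extract_data(enable_cuda, data, wrap_in_variable=True)
        outputs = net(inputs)
        running_loss += criterion(outputs, labels).data[0]
        running_error += error_criterion(outputs, labels)
    return running_loss/(i+1), running_error/(i+1)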

What seems wrong to me is that CIFAR-10 ran in 5 minutes with my second version (my GPU is a Tesla K80), so I am assuming I must be doing something wrong…

If there were any error issues, it was probably in how the errors were being tracked. Use the following:
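
Something like this classification-error helper (a sketch; the point is the .data[0] so a plain Python float gets accumulated rather than a Variable):

import torch

def error_criterion(outputs, labels):
    ''' Fraction of the batch that is misclassified, as a plain Python float. '''
    max_vals, max_indices = torch.max(outputs, 1)
    train_error = (max_indices != labels).sum().data[0] / max_indices.size(0)
    return train_error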

For the fix, check: