K-Fold Cross-Validation with Optuna

Hi everyone,
I hope you are doing well.

I want to use Optuna for hyperparameter optimization of my CNN model, and I want to apply five-fold cross-validation for the train/test splitting phase. The solution posted by Toshihiko Yanase here is as follows:

# This example is copied from https://github.com/optuna/optuna/blob/master/examples/pytorch_simple.py.
import os

import numpy as np
import optuna
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.model_selection import KFold
from torchvision import datasets, transforms

DEVICE = torch.device("cpu")
BATCHSIZE = 128
CLASSES = 10
DIR = os.getcwd()
EPOCHS = 10
LOG_INTERVAL = 10
N_TRAIN_EXAMPLES = BATCHSIZE * 30
N_VALID_EXAMPLES = BATCHSIZE * 10


def define_model(trial):
    # We optimize the number of layers, hidden units and dropout ratio in each layer.
    n_layers = trial.suggest_int("n_layers", 1, 3)
    layers = []

    in_features = 28 * 28
    for i in range(n_layers):
        out_features = trial.suggest_int("n_units_l{}".format(i), 4, 128)
        layers.append(nn.Linear(in_features, out_features))
        layers.append(nn.ReLU())
        p = trial.suggest_float("dropout_l{}".format(i), 0.2, 0.5)
        layers.append(nn.Dropout(p))

        in_features = out_features
    layers.append(nn.Linear(in_features, CLASSES))
    layers.append(nn.LogSoftmax(dim=1))

    return nn.Sequential(*layers)


def objective(trial, train_loader, valid_loader):

    # Generate the model.
    model = define_model(trial).to(DEVICE)

    # Generate the optimizers.
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

    # Training of the model.
    for epoch in range(EPOCHS):
        # Switch back to training mode at the start of each epoch,
        # since the validation step below puts the model in eval mode.
        model.train()
        for batch_idx, (data, target) in enumerate(train_loader):
            # Limiting training data for faster epochs.
            if batch_idx * BATCHSIZE >= N_TRAIN_EXAMPLES:
                break

            data, target = data.view(data.size(0), -1).to(DEVICE), target.to(DEVICE)

            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()

        # Validation of the model.
        model.eval()
        correct = 0
        with torch.no_grad():
            for batch_idx, (data, target) in enumerate(valid_loader):
                # Limiting validation data.
                if batch_idx * BATCHSIZE >= N_VALID_EXAMPLES:
                    break
                data, target = data.view(data.size(0), -1).to(DEVICE), target.to(DEVICE)
                output = model(data)
                # Get the index of the max log-probability.
                pred = output.argmax(dim=1, keepdim=True)
                correct += pred.eq(target.view_as(pred)).sum().item()

        accuracy = correct / min(len(valid_loader.dataset), N_VALID_EXAMPLES)

    return accuracy

def objective_cv(trial):

    # Get the MNIST dataset.
    dataset = datasets.MNIST(DIR, train=True, download=True, transform=transforms.ToTensor())

    fold = KFold(n_splits=3, shuffle=True, random_state=0)
    scores = []
    for train_idx, valid_idx in fold.split(range(len(dataset))):
        train_data = torch.utils.data.Subset(dataset, train_idx)
        valid_data = torch.utils.data.Subset(dataset, valid_idx)

        train_loader = torch.utils.data.DataLoader(
            train_data,
            batch_size=BATCHSIZE,
            shuffle=True,
        )
        valid_loader = torch.utils.data.DataLoader(
            valid_data,
            batch_size=BATCHSIZE,
            shuffle=True,
        )

        accuracy = objective(trial, train_loader, valid_loader)
        scores.append(accuracy)
    return np.mean(scores)

study = optuna.create_study(direction="maximize")
study.optimize(objective_cv, n_trials=20, timeout=600)

pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]

print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial

print("  Value: ", trial.value)

print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

As shown in this code snippet, two objective functions are defined: objective, which trains and evaluates the model on a single fold, and objective_cv, which runs it over all the folds. The trials made by Optuna try to maximize the mean accuracy across the folds (objective_cv), not the accuracy of a single fold (objective). That is very interesting; however, I want to use the same hyperparameters in all folds; for instance, the same optimizer, the same learning rate, and the same batch size in every fold, which seems more reasonable and is more convenient for reporting.
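
One way to make this explicit is to hoist the suggest_* calls out of the per-fold objective into objective_cv, so each hyperparameter is suggested exactly once per trial and passed down unchanged. (If I understand correctly, Optuna returns the stored value when suggest_* is called again with the same name within one trial, so the snippet above may already behave this way; hoisting just makes it obvious.) A rough sketch adapting the snippet above; the batch_size parameter and the modified objective signature are my own additions:

def objective_cv(trial):

    # Suggest everything once, up front, so every fold sees identical values.
    params = {
        "optimizer": trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"]),
        "lr": trial.suggest_float("lr", 1e-5, 1e-1, log=True),
        "batch_size": trial.suggest_categorical("batch_size", [32, 64, 128]),
    }

    dataset = datasets.MNIST(DIR, train=True, download=True, transform=transforms.ToTensor())
    fold = KFold(n_splits=5, shuffle=True, random_state=0)
    scores = []
    for train_idx, valid_idx in fold.split(range(len(dataset))):
        train_loader = torch.utils.data.DataLoader(
            torch.utils.data.Subset(dataset, train_idx),
            batch_size=params["batch_size"],
            shuffle=True,
        )
        valid_loader = torch.utils.data.DataLoader(
            torch.utils.data.Subset(dataset, valid_idx),
            batch_size=params["batch_size"],
        )
        # objective would be modified to read the fixed params dict
        # instead of calling trial.suggest_* itself.
        scores.append(objective(trial, params, train_loader, valid_loader))
    return np.mean(scores)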

Following that idea, I tried this code snippet for my own model:

import numpy as np
import optuna
import torch
import torch.nn as nn
from torch.utils import data
from torchvision import transforms
from sklearn.model_selection import StratifiedGroupKFold

# device, csv_data, alexnet, and myDataset are defined earlier in my script.

def train_alexnet(model, device, train_dataloader, test_dataloader, num_epochs, loss_fn, optimizer, trial):

  train_losses = []
  test_losses = []
  train_accs = []
  test_accs = []

  for epoch in range(num_epochs):

    # Training phase
    model.train()

    train_loss = 0.0
    num_correct = 0

    for image, label in train_dataloader:
      label = label.float()

      output = model(image)
      loss = loss_fn(output, label.view(-1, 1))
      train_loss += loss.item() * image.size(0)

      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      pred = output > 0
      num_correct += torch.sum(pred == label.view(-1, 1))

    train_loss_epoch = train_loss / len(train_dataloader.dataset)
    train_losses.append(train_loss_epoch)

    train_acc_epoch = num_correct.double() / len(train_dataloader.dataset)
    train_accs.append(train_acc_epoch.item())

    # Validation phase
    model.eval()

    test_loss = 0.0
    num_correct = 0

    with torch.no_grad():
      for image, label in test_dataloader:
        label = label.float()

        output = model(image)
        loss = loss_fn(output, label.view(-1, 1))
        test_loss += loss.item() * image.size(0)

        pred = output > 0
        num_correct += torch.sum(pred == label.view(-1, 1))

    test_loss_epoch = test_loss / len(test_dataloader.dataset)
    test_losses.append(test_loss_epoch)

    test_acc_epoch = num_correct.double() / len(test_dataloader.dataset)
    test_accs.append(test_acc_epoch.item())

    trial.report(test_acc_epoch, epoch)

    # Handle pruning based on the intermediate value.
    if trial.should_prune():
      raise optuna.TrialPruned()

  return test_acc_epoch

#####################################
# This function moves my whole dataset to the GPU before starting the training phase.

def gpu_dataset_generator(train_dataset, test_dataset, device):

  train_dataset_list = []
  test_dataset_list = []

  train_dataloader_cpu = data.DataLoader(train_dataset, batch_size=64, pin_memory=True)
  test_dataloader_cpu = data.DataLoader(test_dataset, batch_size=64, pin_memory=True)

  # First move the train dataset
  for image, label in train_dataloader_cpu:
    image, label = image.to(device), label.to(device)
    train_dataset_list.append(data.TensorDataset(image, label))

  # Then move the test dataset
  for image, label in test_dataloader_cpu:
    image, label = image.to(device), label.to(device)
    test_dataset_list.append(data.TensorDataset(image, label))

  train_dataset_gpu = data.ConcatDataset(train_dataset_list)
  test_dataset_gpu = data.ConcatDataset(test_dataset_list)

  return train_dataset_gpu, test_dataset_gpu

###############################

def objective(trial):

  n_folds = 5
  skfg = StratifiedGroupKFold(n_splits=n_folds, shuffle=True, random_state=42)

  labels_total = csv_data.iloc[:, 0]
  groups_total = csv_data.iloc[:, 2]

  acc_folds = 0

  # suggest_float(..., log=True) is the non-deprecated form of suggest_loguniform.
  lr = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
  dropout_probab = trial.suggest_float('dropout_probab', 0.5, 0.9, step=0.1)
  vertical_flip_prob = trial.suggest_float("vertical_flip_prob", 0, 1, step=0.1)
  rotation_angle = trial.suggest_float("rotation_angle", 0, 30, step=10)
  color_jitter_brightness = trial.suggest_float("color_jitter_brightness", 0.2, 0.8, step=0.1)
  #color_jitter_contrast = trial.suggest_float("color_jitter_contrast", 0, 1, step=0.1)
  #shift_horizontal = trial.suggest_float("shift_horizontal", 0, 0.2)
  #shift_vertical = trial.suggest_float("shift_vertical", 0, 0.2)

  train_transform = transforms.Compose([transforms.Resize(128),
                                        transforms.RandomVerticalFlip(vertical_flip_prob),
                                        transforms.RandomRotation(rotation_angle),
                                        transforms.ColorJitter(brightness=color_jitter_brightness), #contrast=color_jitter_contrast),
                                        #transforms.RandomAffine(degrees=0, translate=(shift_horizontal, shift_vertical)),
                                        transforms.Grayscale(num_output_channels=3)])
  test_transform = transforms.Compose([transforms.Resize(128),
                                       transforms.Grayscale(num_output_channels=3)])

  for train_indices, test_indices in skfg.split(np.zeros(len(labels_total)), labels_total, groups_total):

    # myDataset is a custom dataset class defined previously.
    train_dataset = myDataset(indices=train_indices, transform=train_transform)
    test_dataset = myDataset(indices=test_indices, transform=test_transform)

    # Here, I send my datasets to the GPU.
    new_train_dataset, new_test_dataset = gpu_dataset_generator(train_dataset=train_dataset,
                                                                test_dataset=test_dataset,
                                                                device=device)

    train_dataloader = data.DataLoader(new_train_dataset, batch_size=32, shuffle=True)
    test_dataloader = data.DataLoader(new_test_dataset, batch_size=32, shuffle=False)

    loss_fn = nn.BCEWithLogitsLoss()

    # The feature extractor part of my CNN is based on AlexNet, with its weights frozen previously.
    alexnet.classifier = nn.Sequential(nn.Linear(256 * 36, 32 * 36),
                                       nn.ReLU(),
                                       nn.Dropout(dropout_probab),
                                       nn.Linear(32 * 36, 16 * 36),
                                       nn.ReLU(),
                                       nn.Dropout(dropout_probab),
                                       nn.Linear(16 * 36, 2 * 36),
                                       nn.ReLU(),
                                       nn.Dropout(dropout_probab),
                                       nn.Linear(72, 8),
                                       nn.ReLU(),
                                       nn.Dropout(dropout_probab),
                                       nn.Linear(8, 1))
    alexnet.to(device)

    optimizer = torch.optim.Adam(alexnet.classifier.parameters(), lr=lr)

    acc = train_alexnet(model=alexnet, device=device,
                        train_dataloader=train_dataloader,
                        test_dataloader=test_dataloader,
                        num_epochs=100, loss_fn=loss_fn,
                        optimizer=optimizer,
                        trial=trial)

    acc_folds += acc

  return acc_folds / n_folds

import logging
import sys

optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))

study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(),
                            pruner=optuna.pruners.SuccessiveHalvingPruner())

study.optimize(objective, n_trials=100)


print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
  print("    {}: {}".format(key, value))

As shown in my code, only a single objective function is defined, but the value that Optuna maximizes is still the mean of the accuracies over all folds. However, this warning is raised every time:

/usr/local/lib/python3.10/dist-packages/optuna/trial/_trial.py:494: UserWarning: The reported value is ignored because this `step` 62 is already reported.
  warnings.warn(

It is raised for each of my 100 epochs in every Optuna trial. I think the logic behind the code is exactly what I want, but how can I handle this warning, and will it cause problems in getting an accurate result?
From the discussion made here, the warning is probably raised because "As in the message, your code calls trial.report at the same step multiple times"; however, I do not understand how to deal with it.
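
If I understand the message correctly, the collision comes from train_alexnet restarting its epoch counter for every fold, so each trial reports steps 0-99 five times. A stripped-down reproduction of the warning (toy values, not my real training code):

import optuna

def demo(trial):
    # Mimic my loop structure: several folds, each restarting epochs at 0.
    for fold in range(5):
        for epoch in range(3):
            trial.report(0.5, epoch)  # the same step is reused across folds -> UserWarning
    return 0.5

study = optuna.create_study()
study.optimize(demo, n_trials=1)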
Thank you very much in advance.

Sorry for bumping this thread; can anyone help me deal with this problem?

To resolve the warning message, you just need to delete the trial.report line.
If you would like to use the pruning feature of Optuna with cross-validation, you need to report the mean intermediate value, i.e., the mean test_acc_epoch over the CV folds, exactly once per epoch. I think this is too costly, so I'd suggest removing the report, i.e., not using the pruning feature.
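
For completeness, reporting once per epoch would mean restructuring the loops so that all folds advance together: build the model, optimizer, and loaders for every fold up front, then run one epoch on each fold inside a single epoch loop and report the mean fold accuracy once per step. A minimal sketch, assuming hypothetical helpers build_folds, train_one_epoch, and evaluate that wrap your existing code:

import optuna

def objective(trial):
    folds = build_folds(trial)  # hypothetical: one (model, optimizer, train_loader, test_loader) per fold
    num_epochs = 100
    for epoch in range(num_epochs):
        accs = []
        for model, optimizer, train_loader, test_loader in folds:
            train_one_epoch(model, optimizer, train_loader)  # assumed helper: your training phase
            accs.append(evaluate(model, test_loader))        # assumed helper: your validation phase
        mean_acc = sum(accs) / len(accs)
        trial.report(mean_acc, epoch)  # exactly one report per step -> no warning
        if trial.should_prune():
            raise optuna.TrialPruned()
    return mean_acc

This keeps every fold at the same epoch when the pruner makes its decision, but it holds all fold models (and, in your setup, all GPU-resident datasets) in memory at once, which is why I would simply drop the report instead.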