Hi everyone,
I hope you are doing well.
I want to use Optuna for hyperparameter optimization of my CNN model, and I want to apply five-fold cross-validation for the train/test splitting phase. The solution posted by Toshihiko Yanase here is as follows:
# This example is copied from https://github.com/optuna/optuna/blob/master/examples/pytorch_simple.py.
import os

import numpy as np
import optuna
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torch.utils.data
from sklearn.model_selection import KFold
from torchvision import datasets
from torchvision import transforms

DEVICE = torch.device("cpu")
BATCHSIZE = 128
CLASSES = 10
DIR = os.getcwd()
EPOCHS = 10
LOG_INTERVAL = 10
N_TRAIN_EXAMPLES = BATCHSIZE * 30
N_VALID_EXAMPLES = BATCHSIZE * 10
def define_model(trial):
    # We optimize the number of layers, hidden units and dropout ratio in each layer.
    n_layers = trial.suggest_int("n_layers", 1, 3)
    layers = []
    in_features = 28 * 28
    for i in range(n_layers):
        out_features = trial.suggest_int("n_units_l{}".format(i), 4, 128)
        layers.append(nn.Linear(in_features, out_features))
        layers.append(nn.ReLU())
        p = trial.suggest_float("dropout_l{}".format(i), 0.2, 0.5)
        layers.append(nn.Dropout(p))
        in_features = out_features
    layers.append(nn.Linear(in_features, CLASSES))
    layers.append(nn.LogSoftmax(dim=1))
    return nn.Sequential(*layers)
def objective(trial, train_loader, valid_loader):
    # Generate the model.
    model = define_model(trial).to(DEVICE)

    # Generate the optimizers.
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer = getattr(optim, optimizer_name)(model.parameters(), lr=lr)

    # Training of the model.
    model.train()
    for epoch in range(EPOCHS):
        for batch_idx, (data, target) in enumerate(train_loader):
            # Limiting training data for faster epochs.
            if batch_idx * BATCHSIZE >= N_TRAIN_EXAMPLES:
                break
            data, target = data.view(data.size(0), -1).to(DEVICE), target.to(DEVICE)
            optimizer.zero_grad()
            output = model(data)
            loss = F.nll_loss(output, target)
            loss.backward()
            optimizer.step()

    # Validation of the model.
    model.eval()
    correct = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(valid_loader):
            # Limiting validation data.
            if batch_idx * BATCHSIZE >= N_VALID_EXAMPLES:
                break
            data, target = data.view(data.size(0), -1).to(DEVICE), target.to(DEVICE)
            output = model(data)
            # Get the index of the max log-probability.
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()
    accuracy = correct / min(len(valid_loader.dataset), N_VALID_EXAMPLES)
    return accuracy
def objective_cv(trial):
    # Get the MNIST dataset.
    dataset = datasets.MNIST(DIR, train=True, download=True, transform=transforms.ToTensor())

    fold = KFold(n_splits=3, shuffle=True, random_state=0)
    scores = []
    for fold_idx, (train_idx, valid_idx) in enumerate(fold.split(range(len(dataset)))):
        train_data = torch.utils.data.Subset(dataset, train_idx)
        valid_data = torch.utils.data.Subset(dataset, valid_idx)
        train_loader = torch.utils.data.DataLoader(
            train_data,
            batch_size=BATCHSIZE,
            shuffle=True,
        )
        valid_loader = torch.utils.data.DataLoader(
            valid_data,
            batch_size=BATCHSIZE,
            shuffle=True,
        )
        accuracy = objective(trial, train_loader, valid_loader)
        scores.append(accuracy)
    return np.mean(scores)
study = optuna.create_study(direction="maximize")
study.optimize(objective_cv, n_trials=20, timeout=600)

pruned_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.PRUNED]
complete_trials = [t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]
print("Study statistics: ")
print("  Number of finished trials: ", len(study.trials))
print("  Number of pruned trials: ", len(pruned_trials))
print("  Number of complete trials: ", len(complete_trials))

print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
As shown in this code snippet, two objective functions are defined: one that trains and evaluates a single fold (objective) and one that aggregates all folds (objective_cv). The trials made by Optuna try to maximize the mean accuracy over all folds (objective_cv), not the accuracy of a single fold (objective). That is very interesting; however, I want to have the same hyperparameters for all folds. For instance, I want the same optimizer, the same learning rate, and the same batch size in every fold, which seems more reasonable and is more convenient for reporting.
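For what it is worth, one explicit way to get that would be to suggest every hyperparameter once in the cross-validation objective, before the fold loop, and then pass the resulting plain values into the per-fold training routine. This is only a minimal sketch of the pattern, not working code: make_loaders and train_one_fold are hypothetical placeholders for the data-splitting and training logic, not functions from the example above.

import numpy as np


def objective_cv(trial):
    # Suggest each hyperparameter exactly once per trial, before the fold
    # loop, so that every fold trains with identical values.
    lr = trial.suggest_float("lr", 1e-5, 1e-1, log=True)
    optimizer_name = trial.suggest_categorical("optimizer", ["Adam", "RMSprop", "SGD"])
    batch_size = trial.suggest_categorical("batch_size", [32, 64, 128])

    scores = []
    for fold_idx in range(5):
        # Hypothetical helpers: build this fold's loaders, then train and
        # evaluate one model with the shared hyperparameters.
        train_loader, valid_loader = make_loaders(fold_idx, batch_size)
        scores.append(train_one_fold(train_loader, valid_loader, lr, optimizer_name))
    return np.mean(scores)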
Following that idea, I tried this code snippet:
def train_alexnet(model, device, train_dataloader, test_dataloader, num_epochs, loss_fn, optimizer, trial):
    train_losses = []
    test_losses = []
    train_accs = []
    test_accs = []
    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0
        num_correct = 0
        for image, label in train_dataloader:
            label = label.float()
            output = model(image)
            loss = loss_fn(output, label.view(-1, 1))
            train_loss += loss.item() * image.size(0)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            pred = output > 0
            num_correct += torch.sum(pred == label.view(-1, 1))
        train_loss_epoch = train_loss / len(train_dataloader.dataset)
        train_losses.append(train_loss_epoch)
        train_acc_epoch = num_correct.double() / len(train_dataloader.dataset)
        train_accs.append(train_acc_epoch.item())

        # Validation phase
        model.eval()
        test_loss = 0.0
        num_correct = 0
        with torch.no_grad():
            for image, label in test_dataloader:
                label = label.float()
                output = model(image)
                loss = loss_fn(output, label.view(-1, 1))
                test_loss += loss.item() * image.size(0)
                pred = output > 0
                num_correct += torch.sum(pred == label.view(-1, 1))
        test_loss_epoch = test_loss / len(test_dataloader.dataset)
        test_losses.append(test_loss_epoch)
        test_acc_epoch = num_correct.double() / len(test_dataloader.dataset)
        test_accs.append(test_acc_epoch.item())

        trial.report(test_acc_epoch, epoch)
        # Handle pruning based on the intermediate value.
        if trial.should_prune():
            raise optuna.TrialPruned()
    return test_acc_epoch
#####################################
# This function moves my whole dataset to the GPU before starting the training phase.
def gpu_dataset_generator(train_dataset, test_dataset, device):
    train_dataset_list = []
    test_dataset_list = []
    train_dataloader_cpu = data.DataLoader(train_dataset, batch_size=64, pin_memory=True)
    test_dataloader_cpu = data.DataLoader(test_dataset, batch_size=64, pin_memory=True)
    # First move the train dataset.
    for image, label in train_dataloader_cpu:
        image, label = image.to(device), label.to(device)
        train_dataset_list.append(data.TensorDataset(image, label))
    # Then move the test dataset.
    for image, label in test_dataloader_cpu:
        image, label = image.to(device), label.to(device)
        test_dataset_list.append(data.TensorDataset(image, label))
    train_dataset_gpu = data.ConcatDataset(train_dataset_list)
    test_dataset_gpu = data.ConcatDataset(test_dataset_list)
    return train_dataset_gpu, test_dataset_gpu
###############################
# Assumes: from torch.utils import data; from sklearn.model_selection import
# StratifiedGroupKFold; csv_data, myDataset, alexnet, and device are defined previously.
def objective(trial):
    n_folds = 5
    skfg = StratifiedGroupKFold(n_splits=n_folds, shuffle=True, random_state=42)
    labels_total = csv_data.iloc[:, 0]
    groups_total = csv_data.iloc[:, 2]
    acc_folds = 0
    lr = trial.suggest_loguniform('lr', 1e-5, 1e-3)
    dropout_probab = trial.suggest_float('dropout_probab', 0.5, 0.9, step=0.1)
    vertical_flip_prob = trial.suggest_float("vertical_flip_prob", 0, 1, step=0.1)
    rotation_angle = trial.suggest_float("rotation_angle", 0, 30, step=10)
    color_jitter_brightness = trial.suggest_float("color_jitter_brightness", 0.2, 0.8, step=0.1)
    #color_jitter_contrast = trial.suggest_float("color_jitter_contrast", 0, 1, step=0.1)
    #shift_horizontal = trial.suggest_uniform("shift_horizontal", 0, 0.2)
    #shift_vertical = trial.suggest_uniform("shift_vertical", 0, 0.2)
    train_transform = transforms.Compose([
        transforms.Resize(128),
        transforms.RandomVerticalFlip(vertical_flip_prob),
        transforms.RandomRotation(rotation_angle),
        transforms.ColorJitter(brightness=color_jitter_brightness),  # contrast=color_jitter_contrast),
        #transforms.RandomAffine(degrees=0, translate=(shift_horizontal, shift_vertical)),
        transforms.Grayscale(num_output_channels=3),
    ])
    test_transform = transforms.Compose([
        transforms.Resize(128),
        transforms.Grayscale(num_output_channels=3),
    ])
    for train_indices, test_indices in skfg.split(np.zeros(len(labels_total)), labels_total, groups_total):
        # myDataset is a custom dataset class defined previously.
        train_dataset = myDataset(indices=train_indices, transform=train_transform)
        test_dataset = myDataset(indices=test_indices, transform=test_transform)
        # Here, I want to send my datasets to the GPU.
        new_train_dataset, new_test_dataset = gpu_dataset_generator(train_dataset=train_dataset,
                                                                    test_dataset=test_dataset,
                                                                    device=device)
        train_dataloader = data.DataLoader(new_train_dataset, batch_size=32, shuffle=True)
        test_dataloader = data.DataLoader(new_test_dataset, batch_size=32, shuffle=False)
        loss_fn = nn.BCEWithLogitsLoss()
        # The feature extractor of my CNN is based on AlexNet, with its weights frozen previously.
        alexnet.classifier = nn.Sequential(nn.Linear(256 * 36, 32 * 36),
                                           nn.ReLU(),
                                           nn.Dropout(dropout_probab),
                                           nn.Linear(32 * 36, 16 * 36),
                                           nn.ReLU(),
                                           nn.Dropout(dropout_probab),
                                           nn.Linear(16 * 36, 2 * 36),
                                           nn.ReLU(),
                                           nn.Dropout(dropout_probab),
                                           nn.Linear(72, 8),
                                           nn.ReLU(),
                                           nn.Dropout(dropout_probab),
                                           nn.Linear(8, 1))
        alexnet.to(device)
        optimizer = torch.optim.Adam(alexnet.classifier.parameters(), lr=lr)
        acc = train_alexnet(model=alexnet, device=device,
                            train_dataloader=train_dataloader,
                            test_dataloader=test_dataloader,
                            num_epochs=100, loss_fn=loss_fn,
                            optimizer=optimizer,
                            trial=trial)
        acc_folds += acc
    return acc_folds / n_folds
import logging
import sys

optuna.logging.get_logger("optuna").addHandler(logging.StreamHandler(sys.stdout))
study = optuna.create_study(direction="maximize", sampler=optuna.samplers.TPESampler(),
                            pruner=optuna.pruners.SuccessiveHalvingPruner())
study.optimize(objective, n_trials=100)

print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))
As shown in my code, only a single objective function is defined, but the value managed by Optuna is still the mean of the accuracies over all folds. However, this warning is raised every time:
/usr/local/lib/python3.10/dist-packages/optuna/trial/_trial.py:494: UserWarning: The reported value is ignored because this `step` 62 is already reported.
warnings.warn(
It is raised for each of my 100 epochs in every Optuna trial. I think the logic behind the code is exactly what I want, but how can I deal with this warning, and will it cause problems in getting an accurate result?
By reading the discussion here, the warning is probably raised because, "As in the message, you[r] code calls trial.report at the same step multiple times"; however, I do not fully understand how to deal with it. My current understanding, and a possible workaround, are sketched below.
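If I read that correctly, my objective calls train_alexnet once per fold within the same trial, so every fold re-reports epoch steps 0 through 99 and the later reports are ignored. Would offsetting the step by the fold index be an acceptable fix? Here is a minimal, self-contained sketch of that idea; fake_accuracy is just a stand-in for my test_acc_epoch, and the loop sizes are shrunk for illustration:

import optuna


def objective(trial):
    n_folds = 2
    num_epochs = 3
    for fold_idx in range(n_folds):
        for epoch in range(num_epochs):
            fake_accuracy = 0.5  # stand-in for my per-epoch test accuracy
            # Without the fold offset, fold 1 would re-report steps 0..2 and
            # trigger "The reported value is ignored because this `step` is
            # already reported."  The offset keeps every step unique.
            trial.report(fake_accuracy, fold_idx * num_epochs + epoch)
            if trial.should_prune():
                raise optuna.TrialPruned()
    return fake_accuracy


study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=1)

Even if this silences the warning, though, I am not sure whether the pruner would then be comparing meaningful quantities across folds, which is part of my question.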
Thank you very much in advance.