Model accuracy & loss remain the same for every trial in Optuna

Hello, my binary text classification training does not improve accuracy or loss. Thanks for any help!

# Define the objective function
def objective(trial):
    
    model = nn.Sequential(
        nn.Linear(500, 512),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(256, 128),
        nn.Linear(128, 5),
        nn.Softmax(dim=1)
    ).to(device)    
    
    # Hyperparameters to tune
    learning_rate = trial.suggest_categorical('learning_rate', [1e-5, 1e-4, 1e-3])
    beta_1 = trial.suggest_categorical('beta_1', [0.8, 0.85, 0.9, 0.95])
    beta_2 = trial.suggest_categorical('beta_2', [0.995, 0.996, 0.997, 0.998, 0.999])
    epsilon = trial.suggest_categorical('epsilon', [1e-8, 1e-7, 1e-6])
    batch_size = 16
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(beta_1, beta_2), eps=epsilon)
    loss_fn = torch.nn.BCEWithLogitsLoss()

    num_epochs = 10
    accuracies = []

    # Train the model
    for epoch in range(num_epochs):
        model.train()
        start_time = time.time()
        for batch_idx, batch in enumerate(train_loader, 1):
            optimizer.zero_grad()
            input_ids, _, labels = batch
            input_ids, labels = input_ids.to(device), labels.to(device)
            outputs = model(input_ids.float())
            max_indices = torch.argmax(outputs, dim=1)
            loss = loss_fn(max_indices.float(), labels.float())
            loss.requires_grad = True
            loss.backward()
            optimizer.step()

        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Epoch {epoch + 1} training completed in time: {elapsed_time}")

        model.eval()
        val_loss = 0.0
        total = 0
        correct = 0

        for batch_idx, batch in enumerate(val_loader, 1):
            with torch.no_grad():
                input_ids, _, labels = batch
                input_ids, labels = input_ids.to(device), labels.to(device)
                
                outputs = model(input_ids.float())
                max_indices = torch.argmax(outputs, dim=1)

                val_loss += loss_fn(max_indices.float(), labels.float())
                total += labels.size(0)
                correct += (max_indices == labels).sum().item()
                
        accuracies.append(correct / total)
        print(f"accuracy: {correct / total}")
    
    print(f"accurasies: {str(accurasies)}")
    trial.set_user_attr("val_loss", val_loss)
    trial.set_user_attr("model", model)
    trial.set_user_attr("hyperparameters", {'learning_rate': learning_rate, 'beta_1': beta_1, 'beta_2': beta_2, 'epsilon': epsilon, 'batch_size': batch_size})
        
        
    val_loss /= len(val_loader)
    print(f"Epoch {epoch + 1}, Validation Loss: {val_loss}, Accuracy: {correct/ total}")
    print(f"Used hyperparameters: {{'learning_rate': {learning_rate}, 'beta_1': {beta_1}, 'beta_2': {beta_2}, 'epsilon': {epsilon}, 'batch_size': {batch_size}}}")
    return val_loss


# Create a study object and optimize the objective function
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)
best_trial = study.best_trial
print(f"Best trial number: {best_trial.number}")
print(f"Best trial validation loss: {best_trial.user_attrs['val_loss']}")
print(f"Best trial hyperparameters: {best_trial.params}")

Your code contains a few issues:

  • Binary classification use cases would use nn.BCEWithLogitsLoss, which accepts model outputs containing logits in the shape [batch_size, 1], or nn.CrossEntropyLoss, which expects logits for two classes in the shape [batch_size, 2] (strictly speaking that is a 2-class multi-class classification). Your model returns 5 logits, so it's unclear what these correspond to. A minimal sketch of both setups follows this list.
  • nn.BCEWithLogitsLoss expects raw logits, so you would also need to remove the nn.Softmax.
  • Integer tensors are not usefully differentiable, so torch.argmax(output, 1) detaches the tensor from the computation graph. Setting loss.requires_grad = True only masks the error; the model parameters will still not receive any valid gradients.
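
A minimal sketch of the two setups, using dummy data; the layer sizes, batch size, and random tensors below are illustrative only, not taken from your dataset:

import torch
import torch.nn as nn

x = torch.randn(16, 500)                               # dummy batch of 16 samples
labels = torch.randint(0, 2, (16,)).float()            # dummy binary targets

# Option 1: one logit per sample + nn.BCEWithLogitsLoss
model_bce = nn.Sequential(nn.Linear(500, 1))           # no Softmax/Sigmoid at the end
logits = model_bce(x)                                  # shape [16, 1]
loss = nn.BCEWithLogitsLoss()(logits.squeeze(1), labels)
loss.backward()                                        # gradients reach the parameters

# Option 2: two logits per sample + nn.CrossEntropyLoss
model_ce = nn.Sequential(nn.Linear(500, 2))            # raw logits, no Softmax
logits = model_ce(x)                                   # shape [16, 2]
loss = nn.CrossEntropyLoss()(logits, labels.long())    # targets are class indices 0/1
loss.backward()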

Thanks for your reply. I changed some things; however, just like you said, removing
loss.requires_grad = True led to an error: "RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn".

# Define the objective function
def objective(trial):
    
    model = nn.Sequential(
        nn.Linear(500, 512),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(256, 128),
        nn.Linear(128, 2)
    ).to(device)    
    
    # Hyperparameters to tune
    learning_rate = trial.suggest_categorical('learning_rate', [1e-5, 1e-4, 1e-3])
    beta_1 = trial.suggest_categorical('beta_1', [0.8, 0.85, 0.9, 0.95])
    beta_2 = trial.suggest_categorical('beta_2', [0.995, 0.996, 0.997, 0.998, 0.999])
    epsilon = trial.suggest_categorical('epsilon', [1e-8, 1e-7, 1e-6])
    batch_size = 16
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(beta_1, beta_2), eps=epsilon)
    loss_fn = torch.nn.BCEWithLogitsLoss()

    num_epochs = 10
    accuracies = []

    # Train the model
    for epoch in range(num_epochs):
        model.train()
        start_time = time.time()
        for batch_idx, batch in enumerate(train_loader, 1):
            optimizer.zero_grad()
            input_ids, _, labels = batch
            input_ids, labels = input_ids.to(device), labels.to(device)
            outputs = model(input_ids.float())
            outputs = torch.argmax(outputs, 1)
            loss = loss_fn(outputs.float(), labels.float())
            loss.backward()
            optimizer.step()

        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Epoch {epoch + 1} training completed in time: {elapsed_time}")

        model.eval()
        val_loss = 0.0
        total = 0
        correct = 0

        for batch_idx, batch in enumerate(val_loader, 1):
            with torch.no_grad():
                input_ids, _, labels = batch
                input_ids, labels = input_ids.to(device), labels.to(device)
                
                outputs = model(input_ids.float())
                outputs = torch.argmax(outputs, 1)
                
                val_loss += loss_fn(outputs.float(), labels.float())
                total += labels.size(0)
                correct += (outputs == labels).sum().item()
                
        accuracies.append(correct / total)
        print(f"accuracy: {correct / total}")
    
    print(f"accurasies: {str(accurasies)}")
    trial.set_user_attr("val_loss", val_loss)
    trial.set_user_attr("model", model)
    trial.set_user_attr("hyperparameters", {'learning_rate': learning_rate, 'beta_1': beta_1, 'beta_2': beta_2, 'epsilon': epsilon, 'batch_size': batch_size})
        
        
    val_loss /= len(val_loader)
    print(f"Epoch {epoch + 1}, Validation Loss: {val_loss}, Accuracy: {correct/ total}")
    print(f"Used hyperparameters: {{'learning_rate': {learning_rate}, 'beta_1': {beta_1}, 'beta_2': {beta_2}, 'epsilon': {epsilon}, 'batch_size': {batch_size}}}")
    return val_loss
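
The RuntimeError is expected here: the training loop still calls torch.argmax(outputs, 1) before computing the loss, and argmax returns an integer tensor with no grad_fn, so there is nothing for backward() to differentiate. A minimal reproduction (the shapes are illustrative):

import torch

logits = torch.randn(4, 2, requires_grad=True)
preds = torch.argmax(logits, dim=1)        # integer tensor, detached from the graph
print(preds.requires_grad, preds.grad_fn)  # False None
loss = preds.float().mean()
loss.backward()                            # raises: RuntimeError: element 0 of tensors
                                           # does not require grad and does not have a grad_fn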

I figured it out. I changed the loss function and removed argmax.

# Define the objective function
def objective(trial):
    
    model = nn.Sequential(
        nn.Linear(500, 512),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(256, 128),
        nn.Linear(128, 2),
        nn.Softmax(dim=1)
    ).to(device)    
    
    # Hyperparameters to tune
    learning_rate = trial.suggest_categorical('learning_rate', [1e-5, 1e-4, 1e-3])
    beta_1 = trial.suggest_categorical('beta_1', [0.8, 0.85, 0.9, 0.95])
    beta_2 = trial.suggest_categorical('beta_2', [0.995, 0.996, 0.997, 0.998, 0.999])
    epsilon = trial.suggest_categorical('epsilon', [1e-8, 1e-7, 1e-6])
    batch_size = 16
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(beta_1, beta_2), eps=epsilon)
    loss_fn = torch.nn.CrossEntropyLoss()

    num_epochs = 10
    accuracies = []

    # Train the model
    for epoch in range(num_epochs):
        model.train()
        start_time = time.time()
        for batch_idx, batch in enumerate(train_loader, 1):
            optimizer.zero_grad()
            input_ids, _, labels = batch
            input_ids, labels = input_ids.to(device), labels.to(device)
            outputs = model(input_ids.float())
            
            loss = loss_fn(outputs, labels)
            
            loss.backward()
            optimizer.step()
            
            if batch_idx % 1000 == 0:
                end_time = time.time()
                elapsed_time = end_time - start_time
                print(f"batch {batch_idx}/{len(train_loader)} completed in time: {elapsed_time}")
            
        
        
        print(f"Epoch {epoch + 1} training completed")

        model.eval()
        val_loss = 0.0
        total = 0
        correct = 0

        for batch_idx, batch in enumerate(val_loader, 1):
            with torch.no_grad():
                input_ids, _, labels = batch
                input_ids, labels = input_ids.to(device), labels.to(device)
                
                outputs = model(input_ids.float())
                
                val_loss += loss_fn(outputs, labels)
                total += labels.size(0)
                max_indice = torch.argmax(outputs, dim=1)
                correct += (max_indice == labels).sum().item()
                
        accuracies.append(correct / total)
        print(f"accuracy: {correct / total}")
    
    print(f"accurasies: {str(accurasies)}")
    trial.set_user_attr("val_loss", val_loss)
    trial.set_user_attr("model", model)
    trial.set_user_attr("hyperparameters", {'learning_rate': learning_rate, 'beta_1': beta_1, 'beta_2': beta_2, 'epsilon': epsilon, 'batch_size': batch_size})
        
        
    val_loss /= len(val_loader)
    print(f"Epoch {epoch + 1}, Validation Loss: {val_loss}, Accuracy: {correct/ total}")
    print(f"Used hyperparameters: {{'learning_rate': {learning_rate}, 'beta_1': {beta_1}, 'beta_2': {beta_2}, 'epsilon': {epsilon}, 'batch_size': {batch_size}}}")
    return val_loss
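
One remaining detail: nn.CrossEntropyLoss already applies log-softmax internally, so the trailing nn.Softmax(dim=1) in the model above is redundant and can flatten the gradients. The usual pattern is to feed raw logits to the loss and apply a softmax only where probabilities are needed. A minimal sketch (the sizes are illustrative):

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(500, 2))   # ends in raw logits, no trailing Softmax
loss_fn = nn.CrossEntropyLoss()            # applies log_softmax + NLL internally

x = torch.randn(16, 500)
labels = torch.randint(0, 2, (16,))
loss = loss_fn(model(x), labels)
loss.backward()

probs = torch.softmax(model(x), dim=1)     # softmax only when probabilities are needed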