Model accuracy & loss remain the same for every trial in Optuna

Hello, my binary text classification training does not improve accuracy or loss. Thanks for any help!

# Define the objective function
def objective(trial):
    
    model = nn.Sequential(
        nn.Linear(500, 512),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(256, 128),
        nn.Linear(128, 5),
        nn.Softmax(dim=1)
    ).to(device)    
    
    # Hyperparameters to tune
    learning_rate = trial.suggest_categorical('learning_rate', [1e-5, 1e-4, 1e-3])
    beta_1 = trial.suggest_categorical('beta_1', [0.8, 0.85, 0.9, 0.95])
    beta_2 = trial.suggest_categorical('beta_2', [0.995, 0.996, 0.997, 0.998, 0.999])
    epsilon = trial.suggest_categorical('epsilon', [1e-8, 1e-7, 1e-6])
    batch_size = 16
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(beta_1, beta_2), eps=epsilon)
    loss_fn = torch.nn.BCEWithLogitsLoss()

    num_epochs = 10
    accuracies = []

    # Train the model
    for epoch in range(num_epochs):
        model.train()
        start_time = time.time()
        for batch_idx, batch in enumerate(train_loader, 1):
            optimizer.zero_grad()
            input_ids, _, labels = batch
            input_ids, labels = input_ids.to(device), labels.to(device)
            outputs = model(input_ids.float())
            max_indices = torch.argmax(outputs, dim=1)
            loss = loss_fn(max_indices.float(), labels.float())
            loss.requires_grad = True
            loss.backward()
            optimizer.step()

        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Epoch {epoch + 1} training completed in time: {elapsed_time}")

        model.eval()
        val_loss = 0.0
        total = 0
        correct = 0

        for batch_idx, batch in enumerate(val_loader, 1):
            with torch.no_grad():
                input_ids, _, labels = batch
                input_ids, labels = input_ids.to(device), labels.to(device)
                
                outputs = model(input_ids.float())
                max_indices = torch.argmax(outputs, dim=1)

                val_loss += loss_fn(max_indices.float(), labels.float())
                total += labels.size(0)
                correct += (max_indices == labels).sum().item()
                
        accuracies.append(correct / total)
        print(f"accuracy: {correct / total}")
    
    print(f"accurasies: {str(accurasies)}")
    trial.set_user_attr("val_loss", val_loss)
    trial.set_user_attr("model", model)
    trial.set_user_attr("hyperparameters", {'learning_rate': learning_rate, 'beta_1': beta_1, 'beta_2': beta_2, 'epsilon': epsilon, 'batch_size': batch_size})
        
        
    val_loss /= len(val_loader)
    print(f"Epoch {epoch + 1}, Validation Loss: {val_loss}, Accuracy: {correct/ total}")
    print(f"Used hyperparameters: {{'learning_rate': {learning_rate}, 'beta_1': {beta_1}, 'beta_2': {beta_2}, 'epsilon': {epsilon}, 'batch_size': {batch_size}}}")
    return val_loss


# Create a study object and optimize the objective function
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=10)
best_trial = study.best_trial
print(f"Best trial number: {best_trial.number}")
print(f"Best trial validation loss: {best_trial.user_attrs['val_loss']}")
print(f"Best trial hyperparameters: {best_trial.params}")

Your code contains a few issues:

  • Binary classification use cases would use nn.BCEWithLogitsLoss, which accepts model outputs containing logits in the shape [batch_size, 1], or nn.CrossEntropyLoss, which expects logits for two classes in the shape [batch_size, 2] (strictly speaking that is a 2-class multi-class classification). Your model returns 5 logits, so it's unclear what these correspond to. A minimal sketch of both setups follows this list.
  • nn.BCEWithLogitsLoss expects raw logits, so you would also need to remove the nn.Softmax.
  • Integer tensors are not usefully differentiable, so torch.argmax(output, 1) detaches the tensor from the computation graph. Setting loss.requires_grad = True only masks the error; the model parameters will still not receive any valid gradients.
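
A minimal sketch of the two setups, using dummy data; the layer sizes, batch size, and random tensors below are illustrative only, not taken from your dataset:

import torch
import torch.nn as nn

x = torch.randn(16, 500)                               # dummy batch of 16 samples
labels = torch.randint(0, 2, (16,)).float()            # dummy binary targets

# Option 1: one logit per sample + nn.BCEWithLogitsLoss
model_bce = nn.Sequential(nn.Linear(500, 1))           # no Softmax/Sigmoid at the end
logits = model_bce(x)                                  # shape [16, 1]
loss = nn.BCEWithLogitsLoss()(logits.squeeze(1), labels)
loss.backward()                                        # gradients reach the parameters

# Option 2: two logits per sample + nn.CrossEntropyLoss
model_ce = nn.Sequential(nn.Linear(500, 2))            # raw logits, no Softmax
logits = model_ce(x)                                   # shape [16, 2]
loss = nn.CrossEntropyLoss()(logits, labels.long())    # targets are class indices 0/1
loss.backward()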

Thanks for your reply. I changed some things; however, just like you said, removing
loss.requires_grad = True led to an error: "RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn".

# Define the objective function
def objective(trial):
    
    model = nn.Sequential(
        nn.Linear(500, 512),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(256, 128),
        nn.Linear(128, 2)
    ).to(device)    
    
    # Hyperparameters to tune
    learning_rate = trial.suggest_categorical('learning_rate', [1e-5, 1e-4, 1e-3])
    beta_1 = trial.suggest_categorical('beta_1', [0.8, 0.85, 0.9, 0.95])
    beta_2 = trial.suggest_categorical('beta_2', [0.995, 0.996, 0.997, 0.998, 0.999])
    epsilon = trial.suggest_categorical('epsilon', [1e-8, 1e-7, 1e-6])
    batch_size = 16
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(beta_1, beta_2), eps=epsilon)
    loss_fn = torch.nn.BCEWithLogitsLoss()

    num_epochs = 10
    accuracies = []

    # Train the model
    for epoch in range(num_epochs):
        model.train()
        start_time = time.time()
        for batch_idx, batch in enumerate(train_loader, 1):
            optimizer.zero_grad()
            input_ids, _, labels = batch
            input_ids, labels = input_ids.to(device), labels.to(device)
            outputs = model(input_ids.float())
            outputs = torch.argmax(outputs, 1)
            loss = loss_fn(outputs.float(), labels.float())
            loss.backward()
            optimizer.step()

        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Epoch {epoch + 1} training completed in time: {elapsed_time}")

        model.eval()
        val_loss = 0.0
        total = 0
        correct = 0

        for batch_idx, batch in enumerate(val_loader, 1):
            with torch.no_grad():
                input_ids, _, labels = batch
                input_ids, labels = input_ids.to(device), labels.to(device)
                
                outputs = model(input_ids.float())
                outputs = torch.argmax(outputs, 1)
                
                val_loss += loss_fn(outputs.float(), labels.float())
                total += labels.size(0)
                correct += (outputs == labels).sum().item()
                
        accuracies.append(correct / total)
        print(f"accuracy: {correct / total}")
    
    print(f"accurasies: {str(accurasies)}")
    trial.set_user_attr("val_loss", val_loss)
    trial.set_user_attr("model", model)
    trial.set_user_attr("hyperparameters", {'learning_rate': learning_rate, 'beta_1': beta_1, 'beta_2': beta_2, 'epsilon': epsilon, 'batch_size': batch_size})
        
        
    val_loss /= len(val_loader)
    print(f"Epoch {epoch + 1}, Validation Loss: {val_loss}, Accuracy: {correct/ total}")
    print(f"Used hyperparameters: {{'learning_rate': {learning_rate}, 'beta_1': {beta_1}, 'beta_2': {beta_2}, 'epsilon': {epsilon}, 'batch_size': {batch_size}}}")
    return val_loss
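
The RuntimeError is expected here: the training loop still calls torch.argmax(outputs, 1) before computing the loss, and argmax returns an integer tensor with no grad_fn, so there is nothing for backward() to differentiate. A minimal reproduction (the shapes are illustrative):

import torch

logits = torch.randn(4, 2, requires_grad=True)
preds = torch.argmax(logits, dim=1)        # integer tensor, detached from the graph
print(preds.requires_grad, preds.grad_fn)  # False None
loss = preds.float().mean()
loss.backward()                            # raises: RuntimeError: element 0 of tensors
                                           # does not require grad and does not have a grad_fn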

I figured it out. I changed the loss function and removed argmax.

# Define the objective function
def objective(trial):
    
    model = nn.Sequential(
        nn.Linear(500, 512),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(512, 256),
        nn.ReLU(),
        nn.Dropout(0.5),
        nn.Linear(256, 128),
        nn.Linear(128, 2),
        nn.Softmax(dim=1)
    ).to(device)    
    
    # Hyperparameters to tune
    learning_rate = trial.suggest_categorical('learning_rate', [1e-5, 1e-4, 1e-3])
    beta_1 = trial.suggest_categorical('beta_1', [0.8, 0.85, 0.9, 0.95])
    beta_2 = trial.suggest_categorical('beta_2', [0.995, 0.996, 0.997, 0.998, 0.999])
    epsilon = trial.suggest_categorical('epsilon', [1e-8, 1e-7, 1e-6])
    batch_size = 16
    
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, betas=(beta_1, beta_2), eps=epsilon)
    loss_fn = torch.nn.CrossEntropyLoss()

    num_epochs = 10
    accuracies = []

    # Train the model
    for epoch in range(num_epochs):
        model.train()
        start_time = time.time()
        for batch_idx, batch in enumerate(train_loader, 1):
            optimizer.zero_grad()
            input_ids, _, labels = batch
            input_ids, labels = input_ids.to(device), labels.to(device)
            outputs = model(input_ids.float())
            
            loss = loss_fn(outputs, labels)
            
            loss.backward()
            optimizer.step()
            
            if batch_idx % 1000 == 0:
                end_time = time.time()
                elapsed_time = end_time - start_time
                print(f"batch {batch_idx}/{len(train_loader)} completed in time: {elapsed_time}")
            
        
        
        print(f"Epoch {epoch + 1} training completed")

        model.eval()
        val_loss = 0.0
        total = 0
        correct = 0

        for batch_idx, batch in enumerate(val_loader, 1):
            with torch.no_grad():
                input_ids, _, labels = batch
                input_ids, labels = input_ids.to(device), labels.to(device)
                
                outputs = model(input_ids.float())
                
                val_loss += loss_fn(outputs, labels)
                total += labels.size(0)
                max_indice = torch.argmax(outputs, dim=1)
                correct += (max_indice == labels).sum().item()
                
        accuracies.append(correct / total)
        print(f"accuracy: {correct / total}")
    
    print(f"accurasies: {str(accurasies)}")
    trial.set_user_attr("val_loss", val_loss)
    trial.set_user_attr("model", model)
    trial.set_user_attr("hyperparameters", {'learning_rate': learning_rate, 'beta_1': beta_1, 'beta_2': beta_2, 'epsilon': epsilon, 'batch_size': batch_size})
        
        
    val_loss /= len(val_loader)
    print(f"Epoch {epoch + 1}, Validation Loss: {val_loss}, Accuracy: {correct/ total}")
    print(f"Used hyperparameters: {{'learning_rate': {learning_rate}, 'beta_1': {beta_1}, 'beta_2': {beta_2}, 'epsilon': {epsilon}, 'batch_size': {batch_size}}}")
    return val_loss
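
One remaining detail: nn.CrossEntropyLoss already applies log-softmax internally, so the trailing nn.Softmax(dim=1) in the model above is redundant and can flatten the gradients. The usual pattern is to feed raw logits to the loss and apply a softmax only where probabilities are needed. A minimal sketch (the sizes are illustrative):

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(500, 2))   # ends in raw logits, no trailing Softmax
loss_fn = nn.CrossEntropyLoss()            # applies log_softmax + NLL internally

x = torch.randn(16, 500)
labels = torch.randint(0, 2, (16,))
loss = loss_fn(model(x), labels)
loss.backward()

probs = torch.softmax(model(x), dim=1)     # softmax only when probabilities are needed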