Stuck at a constant accuracy in binary classification

import pandas as pd
import torch
import numpy as np

import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from torch.utils.data import Dataset,DataLoader

def data_preprocessing(task_1a_dataframe):
##loading data
encoded_dataframe = pd.read_csv(“/content/task_1a_dataset.csv”)

labeling binary data

le = LabelEncoder()
encoded_dataframe[‘Gender’] = le.fit_transform(encoded_dataframe[‘Gender’])
encoded_dataframe[‘EverBenched’] = le.fit_transform(encoded_dataframe[‘EverBenched’])

one-hot-encoding

ct = ColumnTransformer(transformers=[(‘encoder’,OneHotEncoder(), [0,2])], remainder=‘passthrough’)
encoded_dataframe = pd.DataFrame(ct.fit_transform(encoded_dataframe))

renaming the headers

headers = [‘Bachelors’,‘Masters’,‘PhD’,‘Bangalore’,‘New Delhi’,‘Pune’,‘JoiningYear’,‘PaymentTier’,‘Age’,‘Gender’,‘EverBenched’,‘ExperienceInCurrentDomain’,‘LeaveOrNot’]
encoded_dataframe.columns=headers

encoded_dataframe=df

return encoded_dataframe

def identify_features_and_targets(encoded_dataframe):
X=encoded_dataframe.iloc[:,:-1]
y=encoded_dataframe.iloc[:,[-1]]

features_and_targets = [[X.values],[y.values]]

return features_and_targets

def load_as_tensors(features_and_targets):
X_train, X_test, y_train, y_test =train_test_split(features_and_targets[0][0], features_and_targets[1][0], test_size = 0.2, random_state = 42)
sc = StandardScaler()

X_train[:,[7,8,11] ] = sc.fit_transform(X_train[:,[7,8,11]])
X_test[:,[7,8,11]] = sc.transform(X_test[:,[7,8,11]])

X_train = torch.from_numpy(X_train)
y_train = torch.from_numpy(y_train)
X_test = torch.from_numpy(X_test)
y_test = torch.from_numpy(y_test)


X_train = X_train.to(torch.float32)
y_train = y_train.to(torch.float32)
X_test = X_test.to(torch.float32)
y_test = y_test.to(torch.float32)

train_loader_X =  DataLoader(dataset=X_train ,batch_size = 32 ,shuffle =True , num_workers=2)    ## num workers 0 means that all the data will be loaded in the main proces
train_loader_y =  DataLoader(dataset=y_train ,batch_size = 32 ,shuffle =True , num_workers=2)    ## num workers 0 means that all the data will be loaded in the main proces
test_loader_X=  DataLoader(dataset=X_test ,batch_size = 32 ,shuffle =True , num_workers=2)    ## num workers 0 means that all the data will be loaded in the main proces
test_loader_y=  DataLoader(dataset=y_test ,batch_size = 32 ,shuffle =True , num_workers=2)    ## num workers 0 means that all the data will be loaded in the main proces

## creating iterables
train_iter_X = iter(train_loader_X)
train_iter_y = iter(train_loader_y)
test_iter_y = iter(test_loader_y)
test_iter_X = iter(test_loader_X)

## storing iterables in a list
iter_lst = [train_loader_X,test_loader_X,train_loader_y,test_loader_y]

tensors_and_iterable_training_data = [X_train , X_test , y_train , y_test , iter_lst]

return tensors_and_iterable_training_data

class Salary_Predictor(nn.Module):
def init(self):
super(Salary_Predictor, self).init()
self.l1 = nn.Linear(12, 1024)
self.bn1 = nn.BatchNorm1d(1024)
self.relu = nn.ReLU()
self.l2 = nn.Linear(1024, 1024)
self.bn2 = nn.BatchNorm1d(1024)
self.tanh = nn.Tanh()
self.l3 = nn.Linear(1024, 512)
self.bn3 = nn.BatchNorm1d(512)
self.l4 = nn.Linear(512, 1)
self.sigmoid = nn.Sigmoid()

def forward(self, x):
    out = self.l1(x)
    out = self.bn1(out)
    out = self.tanh(out)

    out = self.l2(out)
    out = self.bn2(out)
    out = self.tanh(out)

    out = self.l3(out)
    out = self.bn3(out)
    out = self.relu(out)

    out = self.l4(out)
    # out = self.sigmoid(out)


    predicted_output = out



    return predicted_output

def model_loss_function():
loss_function =nn.BCEWithLogitsLoss()
return loss_function

def model_optimizer(model):
optimizer = torch.optim.Adamax(model.parameters(), lr =0.01)

return optimizer

def model_number_of_epochs():
number_of_epochs = 15

return number_of_epochs

def training_function(model, number_of_epochs, tensors_and_iterable_training_data, loss_function, optimizer):
trained_model = Salary_Predictor()
loss = model_loss_function()
optimizer = model_optimizer(trained_model)
n_correct = 0
n_samples = 0

for epoch in range(model_number_of_epochs()):

  train_loader_X=tensors_and_iterable_training_data[4][0]
  train_loader_y=tensors_and_iterable_training_data[4][2]
  train_iter_X = iter(train_loader_X)
  train_iter_y = iter(train_loader_y)
  sum = 0
#   print(epoch)
#   print(len(train_loader_X))
  for i in range(len(train_loader_X)):
    X_data = next(train_iter_X)
    y_data = next(train_iter_y)

    ## no need of reshaping

    ## prediction
    y_pred = trained_model(X_data)


    ls = loss(y_pred , y_data)


    ls.backward()
    optimizer.step()
    optimizer.zero_grad()

    n_samples += y_pred.shape[0]
    outputs = (y_pred >= 0.4).to(torch.float32)
    n_correct += (outputs == y_data).sum().item()

    sum += outputs.sum()
    acc = n_correct/n_samples
    if (i+1)%58 == 0:
     print(f'epoch:{epoch} || acc:{acc} || ls:{ls} || positive:{sum}')
return trained_model

def validation_function(trained_model, tensors_and_iterable_training_data):
test_loader_X = tensors_and_iterable_training_data[4][1]
test_loader_y = tensors_and_iterable_training_data[4][3]
test_iter_X = iter(test_loader_X)
test_iter_y = iter(test_loader_y)
n_correct = 0
n_samples = 0
sum = 0
loss = model_loss_function()
for i in range(len(test_loader_X)):
X_data = next(test_iter_X)
y_data = next(test_iter_y)

    output = trained_model(X_data)
    output = (output>=0.5).to(torch.float32)
    # y_data = y_data.reshape(y_data.shape[0])

    ls = loss(output,y_data)

    n_samples += y_data.shape[0]
    n_correct += (output == y_data).sum().item()

    sum += output.sum()
    print(f'loss:{ls} || positive:{sum}')


model_accuracy = 100*(n_correct/n_samples)
return model_accuracy

device = torch.device(‘cuda’ if torch.cuda.is_available else ‘cpu’)

task_1a_dataframe = pd.read_csv(‘/content/task_1a_dataset.csv’)

encoded_dataframe = data_preprocessing(task_1a_dataframe)

features_and_targets = identify_features_and_targets(encoded_dataframe)

tensors_and_iterable_training_data = load_as_tensors(features_and_targets)

model = Salary_Predictor()

loss_function = model_loss_function()
optimizer = model_optimizer(model)
number_of_epochs = model_number_of_epochs()

trained_model = training_function(model, number_of_epochs, tensors_and_iterable_training_data,
loss_function, optimizer)
model_accuracy = validation_function(trained_model,tensors_and_iterable_training_data)
print(f"Accuracy on the test set = {model_accuracy}")

I am getting stuck at an accuracy of around 64-65% in both training and testing.
I did some digging and found that the model is tending to learn so to make all the outputs as 0(instead of some 0 and some 1).
So when all my predictions come out to be 0, it it obvious that my accracy will remain constant.

Any solution to this regarding how can i avoid the direction towards which my model is tending to learn.

Hi Aditya!

The short story: You should most likely be thresholding against zero when
you convert the output of your model to “hard” yes-no predictions. (You are
thresholding against one half.)

I don’t really follow your code, but I do have some comments.

I don’t see where or whether you are using one-hot encoding in your actual
training, but if you are, it’s probably a mistake.

This looks reasonable for a simple model (depending on your use case, of
course). Note that the call to sigmoid() is commented out so the output
of your model is the output of the Linear layer l4. This is what you want.
(See further discussion below.)

As a aside, conventional wisdom holds that the bias in a Linear layer that
precedes a BatchNorm doesn’t do any good because the BatchNorm undoes
the effect of the bias. (But the bias shouldn’t hurt, and I doubt that this is
relevant to your issue.)

This is perfectly appropriate for a binary-classification problem (but you do
not want to use one-hot encoding on the ground-truth target you pass into
BCEWithLogitsLoss).

Adamax could be a good optimizer choice for your use case, but when you’re
having trouble with training, I always recommend starting first with plain-vanilla
SGD with a small learning rate. Get that working first, even if it trains slowly,
and then try out fancier optimizers.

You haven’t said how many samples are in your training set – that is, how big
an epoch is – but, as a general rule, fifteen epochs isn’t very much at all. I would
suggest training much longer before concluding that your training is stuck.

You are using BCEWithLogitsLoss (and y_pred is the output of the final
Linear layer of your model – the call to sigmoid() is commented out), so
y_pred is a batch of logits that run from -inf to inf. Therefore, when
converting y_pred to “hard” yes-no predictions you would not typically
threshold them against 0.4 (which is close to 0.5), but rather against 0.0.
A logit of zero corresponds to a probability of one half, and a common
thresholding strategy is to say that a predicted probability of less than one
half – for which the corresponding logit is less than zero – means a hard
prediction of “no.”

Now you’re thresholding against 0.5, but, again, you probably want to be
thresholding the logits against zero.

If I understand you correctly, your outputs after thresholding your y_pred
against 0.4 (and 0.5)
contains a lot of zeros. But what does your y_pred
look like? Maybe it’s okay.

Your y_pred is (trained to be) logits. When you threshold your logits against
a number greater than zero (such as 0.4) you will get a lot more 0 predictions
than you would had you thresholded against zero.

First try thresholding against zero. If things still look funny, follow up on my
other comments. If you still have issues, show us some detail about how
your loss and accuracy for both your training and validation sets evolve during
training (and training for significantly more than fifteen epochs).

Good luck.

K. Frank