I am getting same accuracy and loss for all the epochs. I tried every thing, so please help me. I am attaching my model and train method code. I am trying to do binary text classification

Model Code

class BertClassifier(nn.Module):

def __init__(self, dropout=0.5):

    super(BertClassifier, self).__init__()

    self.bert = BertModel.from_pretrained('bert-base-cased', from_tf=False)
    self.dropout = nn.Dropout(dropout)
    self.linear = nn.Linear(768, 1)
    self.relu = nn.Sigmoid()

def forward(self, input_id, mask):

    _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
    dropout_output = self.dropout(pooled_output)
    linear_output = self.linear(dropout_output)
    final_layer = self.relu(linear_output)

    return final_layer

DataLoading

train, val = Dataset(df_train), Dataset(df_val)

train_dataloader = torch.utils.data.DataLoader(train, batch_size=32, shuffle=True)

val_dataloader = torch.utils.data.DataLoader(val, batch_size=32)

Train method

def train(model, train_data, val_data, learning_rate, epochs):

# train, val = Dataset(train_data), Dataset(val_data)

# train_dataloader = torch.utils.data.DataLoader(train, batch_size=16, shuffle=True)
# val_dataloader = torch.utils.data.DataLoader(val, batch_size=16)

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr= learning_rate)
trainlen=24000
vallen=3000

if use_cuda:

        model = model.cuda()
        criterion = criterion.cuda()

for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0
        y_pred=[]
        y_true=[]

        for train_input, train_label in tqdm(train_data):

            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)
            
            output = model(input_id, mask)
            #_, predt = torch.max(output, 1)
            
            batch_loss = criterion(output.float(), train_label.float().unsqueeze(1))
            total_loss_train += batch_loss.item()

            #print(output, train_label)
            
            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc
            # pred = np.round(output.detach())
            # target = np.round(train_label.detach())             
            # y_pred.extend(pred.tolist())
            # y_true.extend(target.tolist())
    
            #batch_loss= batch_loss(batch_loss, requires_grad=True)
            optimizer.zero_grad()
            #batch_loss.requires_grad=True
            batch_loss.backward() 
            optimizer.step()
        
        total_acc_val = 0
        total_loss_val = 0

        with torch.no_grad():
            for val_input, val_label in val_dataloader:

                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                #_, predt = torch.max(output, 1)

                batch_loss = criterion(output.float(), val_label.float().unsqueeze(1))
                total_loss_val += batch_loss.item()
                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
        
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / int(trainlen): .3f} \
            | Train Accuracy: {total_acc_train / int(trainlen): .3f} \
            | {total_acc_train} and {trainlen}\
            | Val Loss: {total_loss_val / int(vallen): .3f} \
            | Val Accuracy: {total_acc_val / int(vallen): .3f}\
            | {total_acc_val} and {vallen}'
            )

EPOCHS = 5
model = BertClassifier()
LR = 0.001

train(model, train_dataloader, val_dataloader, LR, EPOCHS)

The reason i am using trainlen value saperaletly while calculating accuracy because if take the length value of trainloader it is giving me the number of batches value.

Remove the usage of the sigmoid activation applied on the output and use nn.BCEWithLogitsLoss for better numerical stability. Afterwards, try to overfit your model on a small subset of your dataset (e.g. just 10 samples) by playing around with some hyperparameters and make sure your model is able to overfit these samples.

Hi ptrblck thank you for answering. But If i remove sigmoid activation then how model is gonna predict the output. As we have 2 nuerons in linear layer Which wont have probabilities right if sigmoid is removed. And I am not even sure how the model is training as i am not sending any train labels to the model. Do you have any idea what is going on.

I have one more question when we do Binary Classification Shouldn’t the num_classes value be 1 right instaed of 2? LIKE IN BELOW CODE in linear layer should it be (768,1) instead of 2.

class BertClassifier(nn.Module):

def __init__(self, dropout=0.1):

    super(BertClassifier, self).__init__()

    self.bert = BertModel.from_pretrained('bert-base-uncased')
    self.dropout = nn.Dropout(dropout)
    self.linear = nn.Linear(768,2)
    self.relu = nn.Sigmoid()

def forward(self, input_id, mask):

    _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
    dropout_output = self.dropout(pooled_output)
    #dropout_output = dropout_output.view(-1, 768)
    linear_output = self.linear(dropout_output)
    final_layer = self.relu(linear_output)

    return final_layer

APPRECIATE YOUR RESPONSE!

Your model will return raw logits in this case. nn.BCEWithLogitsLoss expects logits and will apply logsigmoid internally for you.

The model’s forward doesn’t need the targets and you are passing the targets to the criterion to compute the loss.

Yes, using a single output neuron is the common approach for a binary classification use case.
You could also use 2 outputs and treat it as a 2-class multi-class classification, but would then need to use another criterion, e.g. nn.CrossEntropyLoss.

Thank You for all the answers ptrblck. Now I have much more clarity. Can you also give me little bit suggestion on how to increase the accuracy(right now it is same for all the epochs). I tried with BCELogitsLoss too still it is same.

Did you try to overfit the small dataset (just pick 10 samples) as recommended before? Your model should be able to overfit this tiny dataset assuming no other issues are in the training script.

Yeah I kept and the accuracy is same as previous time (which is 50) , and whatever the learning rate i keep accuracy is same for all the epochs.

Data Loading

train, val = Dataset(df_train), Dataset(df_val)

train_dataloader = torch.utils.data.DataLoader(train, batch_size=1, shuffle=True)

val_dataloader = torch.utils.data.DataLoader(val, batch_size=1)

Model

class BertClassifier(nn.Module):

def __init__(self, dropout=0.1):

    super(BertClassifier, self).__init__()

    self.bert = BertModel.from_pretrained('bert-base-uncased')
    self.dropout = nn.Dropout(dropout)
    self.linear = nn.Linear(768,1)
    self.relu = nn.Sigmoid()

def forward(self, input_id, mask):

    _, pooled_output = self.bert(input_ids= input_id, attention_mask=mask,return_dict=False)
    dropout_output = self.dropout(pooled_output)
    linear_output = self.linear(dropout_output)
    final_layer = self.relu(linear_output)

    return final_layer

Train Method

def train(model, train_data, val_data, learning_rate, epochs):

# train, val = Dataset(train_data), Dataset(val_data)

# train_dataloader = torch.utils.data.DataLoader(train, batch_size=32, shuffle=True) 'len(train_data)' len(val_data)
# val_dataloader = torch.utils.data.DataLoader(val, batch_size=32) 

trainlen=24
vallen=3

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

criterion = nn.BCELoss()
optimizer = Adam(model.parameters(), lr= learning_rate)


if use_cuda:

        model = model.cuda()
        criterion = criterion.cuda()

for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0
        model.train()
        # y_pred=[]
        # y_true=[]

        for train_input, train_label in tqdm(train_data):


            train_label = train_label.to(device)
            mask = train_input['attention_mask'].to(device)
            input_id = train_input['input_ids'].squeeze(1).to(device)

            optimizer.zero_grad()
            
            output = model(input_id, mask)
            #_, pred = torch.max(output, 1)
            #print(output.dtype)
            #print(train_label.dtype)
            batch_loss = criterion(output, train_label.unsqueeze(1).float())
            total_loss_train += batch_loss.item()

            #print(output, train_label)
            
            acc = (output.argmax(dim=1) == train_label).sum().item()
            total_acc_train += acc
             
            #batch_loss.requires_grad=True
            batch_loss.backward() 
            optimizer.step()
        
        total_acc_val = 0
        total_loss_val = 0

        model.eval()

        with torch.no_grad():
            for val_input, val_label in val_data:

                val_label = val_label.to(device)
                mask = val_input['attention_mask'].to(device)
                input_id = val_input['input_ids'].squeeze(1).to(device)

                output = model(input_id, mask)
                #_, pred = torch.max(output, 1)

                batch_loss = criterion(output.float(), val_label.unsqueeze(1).float())
                total_loss_val += batch_loss.item()
                acc = (output.argmax(dim=1) == val_label).sum().item()
                total_acc_val += acc
        
        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / trainlen : .3f} \
            | Train Accuracy: {total_acc_train / trainlen : .3f} \
            | {total_acc_train} and {trainlen}\
            | Val Loss: {total_loss_val / vallen: .3f} \
            | Val Accuracy: {total_acc_val / vallen: .3f}\
            | {total_acc_val} and {vallen}'
            )

EPOCHS = 5
model = BertClassifier()
LR = 2e-5

train(model,train_dataloader, val_dataloader, LR, EPOCHS)

And I also tried same method with BCEWithLogitLoss and no sigmoid it still does not change.

Is something wrong with the training method. It looks like it is not training at all or updating the weights maybe. I dont know.