Models fail to train on simple classification problem, any hints?

I’ve tried so far a couple of sequence classifiers like Bert and Roberta and I’m unable to make them overfit or even make them learn the following toy dataset problem.

Any suggestions on what/why they don’t train on this simple toy task?

It’s a simple binary classification problems shouldn’t this be easy to learn for such models?

import torch
import random
import numpy as np
import pandas as pd
from datasets import Dataset
from torch.utils.data import DataLoader, TensorDataset
from transformers.trainer import Trainer, TrainingArguments
from transformers.data.data_collator import DataCollatorWithPadding
from transformers.models.auto import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification

def generate_addition_data(num_samples):
    data = []
    for _ in range(num_samples):
        a = random.randint(0, 999)
        b = random.randint(0, 999)
        question = f"{a} + {b}"
        correct_answer = a + b
        
        is_correct = random.choice([True, False])
        
        if is_correct:
            answer = correct_answer
        else:
            # Generate an incorrect answer by adding or subtracting a random value
            answer = correct_answer + random.choice([-10, -5, -1, 1, 5, 10])

        data.append({
            'question': question,
            'answer': str(answer),
            'label': 1 if is_correct else 0
        })
    
    return data

data = generate_addition_data(10_000)  # Generate 10,000 samples
df = pd.DataFrame(data)
dataset = Dataset.from_pandas(df)

checkpoint = "FacebookAI/roberta-base"

def preprocess_function(examples):
    inputs = [f"Calculate: {q} = {a}" for q, a in zip(examples["question"], examples['answer'])]
    labels = [a for a in examples['label']]
    model_inputs = tokenizer(inputs, max_length=128, padding="max_length", truncation=True, return_tensors='pt')
    model_inputs['labels'] = labels
    return model_inputs


tokenizer = AutoTokenizer.from_pretrained(checkpoint)

tokenized_datasets = dataset.map(preprocess_function, batched=True)


# pos_weights = len(df) / (2 * df.label.value_counts()[1])
# neg_weights = len(df) / (2 * df.label.value_counts()[0])

train_test_split = tokenized_datasets.train_test_split(test_size=2000, seed=2024)

pos_weights = len(train_test_split['train'].to_pandas()) / (2 * train_test_split['train'].to_pandas().label.value_counts()[1])
neg_weights = len(train_test_split['train'].to_pandas()) / (2 * train_test_split['train'].to_pandas().label.value_counts()[0])

print(f"Pos weights: {pos_weights}, Neg weights: {neg_weights}")

config = AutoConfig.from_pretrained(checkpoint)
config.num_labels = 2
model = AutoModelForSequenceClassification.from_config(config)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer, padding=True)

training_args = TrainingArguments(
    output_dir="./tseqclassifier_checkpoints/",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=140,
    per_device_eval_batch_size=140,
    num_train_epochs=200,
    weight_decay=0.01,
    warmup_steps=500,
    # use_cpu=True
    load_best_model_at_end=True,
)

class WeightedCELossTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        # Get model's predictions
        outputs = model(**inputs)
        logits = outputs.get("logits")
        # Compute custom loss
        loss_fct = torch.nn.CrossEntropyLoss(weight=torch.tensor([pos_weights, neg_weights], device=model.device, dtype=logits.dtype))
       # loss_fct = torch.nn.CrossEntropyLoss(pos_weights=torch.tensor([0.9785, 10.24], device=model.device, dtype=logits.dtype))
        loss = loss_fct(logits.view(-1, self.model.config.num_labels), labels.view(-1))
        return (loss, outputs) if return_outputs else loss


trainer = WeightedCELossTrainer(
    model=model,
    args=training_args,
    train_dataset=train_test_split['train'],
    eval_dataset=train_test_split['test'],
    tokenizer = None,
    compute_metrics =compute_metrics,
)

trainer.train()


def predict(question):
    inputs = tokenizer(f"Calculate: {question}", return_tensors="pt").input_ids.cuda()
    outputs = model(inputs)
    print(f"outputs = {outputs.logits.argmax(dim=1).item()}")
    answer = tokenizer.decode(outputs.logits.argmax(dim=1).cpu().numpy(), skip_special_tokens=True)
    return answer

question1 = "15 + 27 = 43"
question2 = "15 + 27 = 42"
print(f"Question: {question1}")
print(f"Predicted Answer: {predict(question1)}")
print(f"Question: {question2}")
print(f"Predicted Answer: {predict(question2)}")