Can you spot why the loss isn't decreasing for a Q&A task?

Hi everyone. I have the following training loop for a question-answering task on the SQuAD dataset.

import torch
from torch import optim
from torch.utils.data import DataLoader, random_split
from tqdm import tqdm
import transformers
from transformers import BertTokenizerFast, BertForQuestionAnswering

transformers.utils.logging.set_verbosity_error()
device = "cuda" if torch.cuda.is_available() else "cpu"

LEARNING_RATE = 1e-3
BATCH_SIZE = 16
EPOCHS = 3
DATA_PATH = "./SQuAD.json"
MODEL_PATH = "bert-base-cased"
MODEL_SAVE_PATH = f"./{MODEL_PATH}-lr{LEARNING_RATE}-epochs{EPOCHS}-batchsize{BATCH_SIZE}/"

tokenizer = BertTokenizerFast.from_pretrained(MODEL_PATH)
model = BertForQuestionAnswering.from_pretrained(MODEL_PATH).to(device)

dataset = SquadDataset(DATA_PATH, tokenizer)  # defined below

# 80/10/10 train/val/test split with a fixed seed for reproducibility
generator = torch.Generator().manual_seed(42)
train_dataset, val_dataset, test_dataset = random_split(dataset, [0.8, 0.1, 0.1], generator=generator)

train_dataloader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_dataloader = DataLoader(dataset=val_dataset, batch_size=BATCH_SIZE, shuffle=False)
test_dataloader = DataLoader(dataset=test_dataset, batch_size=BATCH_SIZE, shuffle=False)

optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

for epoch in tqdm(range(EPOCHS)):
    model.train()
    train_running_loss = 0
    for idx, sample in enumerate(tqdm(train_dataloader)):
        input_ids = sample['input_ids'].to(device)
        attention_mask = sample['attention_mask'].to(device)
        start_positions = sample['start_positions'].to(device)
        end_positions = sample['end_positions'].to(device)

        # Passing the gold positions makes the model compute the loss itself
        outputs = model(input_ids=input_ids,
                        attention_mask=attention_mask,
                        start_positions=start_positions,
                        end_positions=end_positions)
        loss = outputs.loss
        train_running_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    train_loss = train_running_loss / (idx + 1)

    model.eval()
    val_running_loss = 0
    with torch.no_grad():
        for idx, sample in enumerate(tqdm(val_dataloader)):
            input_ids = sample['input_ids'].to(device)
            attention_mask = sample['attention_mask'].to(device)
            start_positions = sample['start_positions'].to(device)
            end_positions = sample['end_positions'].to(device)
            outputs = model(input_ids=input_ids,
                            attention_mask=attention_mask,
                            start_positions=start_positions,
                            end_positions=end_positions)
            val_running_loss += outputs.loss.item()
        val_loss = val_running_loss / (idx + 1)

    print("-" * 30)
    print(f"Train Loss EPOCH {epoch+1}: {train_loss:.4f}")
    print(f"Valid Loss EPOCH {epoch+1}: {val_loss:.4f}")
    print("-" * 30)

The dataset class is:

import torch
from torch.utils.data import Dataset
import json

class SquadDataset(Dataset):
    def __init__(self, data_path, tokenizer):
        contexts, questions, answers = self.read_data(data_path)
        answers = self.add_end_idx(contexts, answers)
        
        encodings = tokenizer(contexts, questions, padding=True, truncation=True)
        self.encodings = self.update_start_end_positions(encodings, answers, tokenizer)

    def __getitem__(self, idx):
        return {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}

    def __len__(self):
        return len(self.encodings.input_ids)

    def read_data(self, path):
        with open(path, 'rb') as f:
            squad = json.load(f)

        contexts = []
        questions = []
        answers = []

        for group in squad['data']:
            for parag in group['paragraphs']:
                context = parag['context']
                for qa in parag['qas']:
                    question = qa['question']
                    for answer in qa['answers']:
                        contexts.append(context)
                        questions.append(question)
                        answers.append(answer)
        # use only the first 5000 QA pairs
        return contexts[:5000], questions[:5000], answers[:5000]

    def add_end_idx(self, contexts, answers):
        # SQuAD answer_start indices are sometimes off by one or two
        # characters, so shift the span until it matches the gold text
        for answer, context in zip(answers, contexts):
            gold_text = answer["text"]
            start_idx = answer["answer_start"]
            end_idx = start_idx + len(gold_text)

            if context[start_idx:end_idx] == gold_text:
                answer["answer_end"] = end_idx
            elif context[start_idx-1:end_idx-1] == gold_text:
                answer["answer_start"] = start_idx - 1
                answer["answer_end"] = end_idx - 1
            elif context[start_idx-2:end_idx-2] == gold_text:
                answer["answer_start"] = start_idx - 2
                answer["answer_end"] = end_idx - 2
        return answers

    def update_start_end_positions(self, encodings, answers, tokenizer):
        # Map character-level answer spans to token-level positions;
        # char_to_token returns None when the answer was truncated away
        start_positions = []
        end_positions = []
        for i in range(len(answers)):
            start_positions.append(encodings.char_to_token(i, answers[i]["answer_start"]))
            end_positions.append(encodings.char_to_token(i, answers[i]["answer_end"] - 1))
            if start_positions[-1] is None:
                start_positions[-1] = tokenizer.model_max_length
            if end_positions[-1] is None:
                end_positions[-1] = tokenizer.model_max_length
        encodings["start_positions"] = start_positions
        encodings["end_positions"] = end_positions

        return encodings

With hyperparameters:

LEARNING_RATE = 1e-3
BATCH_SIZE = 16
EPOCHS = 3

However, the loss values stay more or less the same, as if no training is happening at all:

Train Loss EPOCH 1: 6.1635
Valid Loss EPOCH 1: 6.2383

Train Loss EPOCH 2: 6.2505
Valid Loss EPOCH 2: 6.2383

Train Loss EPOCH 3: 6.2510
Valid Loss EPOCH 3: 6.2383

Why is that? Can you spot what is wrong here?

EDIT: Updated the full code.

I don't see any obvious issues in your code. Try to overfit a small dataset, e.g. just 10 samples, by playing around with your training hyperparameters, and make sure your model is able to learn them. A rough sketch of what I mean is below.
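Something along these lines (a sketch only: it reuses the dataset, model, and device from your script, and the 10-sample subset and the 3e-5 learning rate are just example values to experiment with, not a verified fix):

import torch
from torch.utils.data import DataLoader, Subset

# Fixed tiny subset: a healthy training setup should drive its loss towards 0
tiny_loader = DataLoader(Subset(dataset, list(range(10))), batch_size=2, shuffle=True)
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)  # example value

model.train()
for epoch in range(50):
    running_loss = 0.0
    for sample in tiny_loader:
        batch = {k: v.to(device) for k, v in sample.items()}
        outputs = model(**batch)  # BertForQuestionAnswering accepts these keys directly
        optimizer.zero_grad()
        outputs.loss.backward()
        optimizer.step()
        running_loss += outputs.loss.item()
    print(f"epoch {epoch+1}: loss {running_loss / len(tiny_loader):.4f}")

If the loss doesn't approach zero even on 10 samples, the problem is in the training setup rather than the amount of data.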

What model are you using? Your own implementation? Can you post the code?

I'm using bert-base-cased from transformers. I've updated the post with the full code. Thanks :slight_smile:

Tried it with several subsets, roughly as in the snippet below. No luck :p The losses always stay close to each other and only move slightly.
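For reference, the subset runs were set up roughly like this (the sizes are just the ones I tried as examples; each run uses a fresh model and the same hyperparameters as in the original post):

from torch.utils.data import DataLoader, Subset

for n in (10, 100, 1000):  # illustrative subset sizes
    loader = DataLoader(Subset(dataset, list(range(n))), batch_size=BATCH_SIZE, shuffle=True)
    # fresh model and optimizer per run so results don't leak between sizes
    model = BertForQuestionAnswering.from_pretrained(MODEL_PATH).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    for epoch in range(EPOCHS):
        running_loss = 0.0
        for sample in loader:
            batch = {k: v.to(device) for k, v in sample.items()}
            outputs = model(**batch)
            optimizer.zero_grad()
            outputs.loss.backward()
            optimizer.step()
            running_loss += outputs.loss.item()
        print(f"n={n} epoch={epoch+1} loss={running_loss / len(loader):.4f}")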