I tried to find where the size error could be, but without success. The training loop always gets stuck at the same sample, even though I checked all the sizes coming out of the DataLoader: ids, token_type_ids, and mask all have length 512, and the target has length 30.
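For reference, the size check was along these lines (a minimal sketch; train_data_loader and the batch size of 4 are defined in the code below):

batch = next(iter(train_data_loader))
print(batch["ids"].shape)             # torch.Size([4, 512])
print(batch["token_type_ids"].shape)  # torch.Size([4, 512])
print(batch["mask"].shape)            # torch.Size([4, 512])
print(batch["targets"].shape)         # torch.Size([4, 30])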
The dataset is from the Google QUEST Q&A Labeling competition on Kaggle. The train dataframe has 6079 samples.
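That sample count is just the length of the competition CSV (same path as in the script below):

import pandas as pd

dfx = pd.read_csv("../input/google-quest-challenge/train.csv").fillna("none")
print(len(dfx))  # 6079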
import torch
import torch.nn as nn
import transformers
import numpy as np
import pandas as pd
from sklearn import model_selection
from scipy import stats
from transformers import AdamW, get_linear_schedule_with_warmup
class BERTBasedUncased(nn.Module):
    def __init__(self):
        super(BERTBasedUncased, self).__init__()
        self.bert = transformers.BertModel.from_pretrained("../input/bert-base-uncased")
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 30)  # 30 target columns

    def forward(self, ids, mask, token_type_ids):
        # return_dict=False yields a tuple; o2 is the pooled [CLS] output
        _, o2 = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        bo = self.bert_drop(o2)
        return self.out(bo)
class BERTDatasetTraining(torch.utils.data.Dataset):
    def __init__(self, qtitle, qbody, answer, targets, tokenizer, max_len):
        self.qtitle = qtitle
        self.qbody = qbody
        self.answer = answer
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.targets = targets

    def __len__(self):
        return len(self.answer)

    def __getitem__(self, item):
        question_title = str(self.qtitle[item])
        question_body = str(self.qbody[item])
        answer = str(self.answer[item])
        # Pair encoding: (title + body) as the first segment, answer as the second;
        # the tokenizer pads/truncates to max_len, so no manual padding is needed
        inputs = self.tokenizer.encode_plus(
            question_title + " " + question_body,
            answer,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length'
        )
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]
        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.targets[item, :], dtype=torch.float)
        }
def loss_fn(outputs, targets):
    # targets are 30 continuous values in [0, 1], treated as multi-label
    return nn.BCEWithLogitsLoss()(outputs, targets)
def train_loop_fn(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    for idx_data, item in enumerate(data_loader):
        ids = item['ids'].to(device, dtype=torch.long)
        mask = item['mask'].to(device, dtype=torch.long)
        token_type_ids = item['token_type_ids'].to(device, dtype=torch.long)
        targets = item['targets'].to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        if idx_data % 10 == 0:
            print(f'idx = {idx_data}, loss = {loss.item()}')
def eval_loop_fn(data_loader, model, device):
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():  # no gradients needed during evaluation
        for _, item in enumerate(data_loader):
            ids = item['ids'].to(device, dtype=torch.long)
            mask = item['mask'].to(device, dtype=torch.long)
            token_type_ids = item['token_type_ids'].to(device, dtype=torch.long)
            targets = item['targets'].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.append(targets.cpu().numpy())
            fin_outputs.append(outputs.cpu().numpy())
    return np.vstack(fin_outputs), np.vstack(fin_targets)
def run():
    MAX_LEN = 512
    TRAIN_BATCH_SIZE = 4
    EPOCHS = 20
    FINE_MODEL_PATH = '../output/fine_tuned_model'

    dfx = pd.read_csv("../input/google-quest-challenge/train.csv").fillna("none")
    df_train, df_valid = model_selection.train_test_split(dfx, random_state=42, test_size=0.1)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    # The 30 target columns come from the sample submission header
    sample = pd.read_csv('../input/google-quest-challenge/sample_submission.csv')
    target_cols = list(sample.drop("qa_id", axis=1).columns)
    train_targets = df_train[target_cols].values
    valid_targets = df_valid[target_cols].values

    tokenizer = transformers.BertTokenizer.from_pretrained("../input/bert-base-uncased")

    train_dataset = BERTDatasetTraining(
        qtitle=df_train['question_title'].values,
        qbody=df_train['question_body'].values,
        answer=df_train['answer'].values,
        targets=train_targets,
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        shuffle=True
    )

    valid_dataset = BERTDatasetTraining(
        qtitle=df_train['question_title'].values,
        qbody=df_train['question_body'].values,
        answer=df_train['answer'].values,
        targets=valid_targets,
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=4,
        shuffle=False
    )

    device = "cuda"
    lr = 3e-5
    num_train_steps = int(len(train_dataset) / TRAIN_BATCH_SIZE * EPOCHS)
    model = BERTBasedUncased().to(device)
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    for epoch in range(EPOCHS):
        train_loop_fn(data_loader=train_data_loader,
                      model=model,
                      optimizer=optimizer,
                      device=device,
                      scheduler=scheduler)
        o, t = eval_loop_fn(data_loader=valid_data_loader, model=model, device=device)

        # Mean column-wise Spearman correlation (the competition metric)
        spear = []
        for response_idx in range(t.shape[1]):
            p1 = list(t[:, response_idx])
            p2 = list(o[:, response_idx])
            coef, _ = np.nan_to_num(stats.spearmanr(p1, p2))
            spear.append(coef)
        spear = np.mean(spear)
        print(f"epoch = {epoch}, spearman = {spear}")
        torch.save(model.state_dict(), FINE_MODEL_PATH)
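Calling run() always fails at the same index, right at the start of the first validation pass: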
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
/tmp/ipykernel_40/766504495.py in <module>
----> 1 run()
/tmp/ipykernel_40/2188526498.py in run()
181 scheduler = scheduler)
182
--> 183 o, t = eval_loop_fn(data_loader = valid_data_loader, model = model, device = device)
184
185 spear = []
/tmp/ipykernel_40/2188526498.py in eval_loop_fn(data_loader, model, device)
94 fin_targets = []
95 fin_outputs = []
---> 96 for _, item in enumerate(data_loader):
97 ids = item['ids']
98 mask = item['mask']
/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py in __next__(self)
433 if self._sampler_iter is None:
434 self._reset()
--> 435 data = self._next_data()
436 self._num_yielded += 1
437 if self._dataset_kind == _DatasetKind.Iterable and \
/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py in _next_data(self)
473 def _next_data(self):
474 index = self._next_index() # may raise StopIteration
--> 475 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
476 if self._pin_memory:
477 data = _utils.pin_memory.pin_memory(data)
/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py in <listcomp>(.0)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
/tmp/ipykernel_40/2188526498.py in __getitem__(self, item)
61 "mask" : torch.tensor(mask, dtype=torch.long),
62 "token_type_ids" : torch.tensor(token_type_ids, dtype=torch.long),
---> 63 "targets" : torch.tensor(self.targets[item, :], dtype=torch.float)
64 }
65
IndexError: index 608 is out of bounds for axis 0 with size 608
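To try to pin it down, I also indexed the validation dataset directly, bypassing the DataLoader (a quick diagnostic sketch, run inside run() right after the loaders are built; names match the code above), and it fails at the same index:

for i in range(len(valid_dataset)):
    try:
        _ = valid_dataset[i]  # triggers __getitem__ exactly like the DataLoader does
    except IndexError:
        print(f"fails at index {i}, len(valid_dataset) = {len(valid_dataset)}, "
              f"targets shape = {valid_dataset.targets.shape}")
        break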