I tried to find where the size error could be, but without success. The training loop always gets stuck at the same sample, even though I checked all the sizes coming out of the DataLoader: ids, token_type_ids, and mask all have length 512, and the target has length 30.
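For reference, the size check was along these lines (a minimal sketch; train_data_loader and the batch size of 4 are defined in the code below):

batch = next(iter(train_data_loader))
print(batch["ids"].shape)             # torch.Size([4, 512])
print(batch["token_type_ids"].shape)  # torch.Size([4, 512])
print(batch["mask"].shape)            # torch.Size([4, 512])
print(batch["targets"].shape)         # torch.Size([4, 30])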
The dataset is from the Google QUEST Q&A Labeling competition on Kaggle. The train dataframe has 6079 samples.
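That sample count is just the length of the competition CSV (same path as in the script below):

import pandas as pd

dfx = pd.read_csv("../input/google-quest-challenge/train.csv").fillna("none")
print(len(dfx))  # 6079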
import torch
import torch.nn as nn
import transformers
import numpy as np
import pandas as pd
from sklearn import model_selection
from scipy import stats
from transformers import AdamW, get_linear_schedule_with_warmup
class BERTBasedUncased(nn.Module):
    def __init__(self):
        super(BERTBasedUncased, self).__init__()
        self.bert = transformers.BertModel.from_pretrained("../input/bert-base-uncased")
        self.bert_drop = nn.Dropout(0.3)
        self.out = nn.Linear(768, 30)  # 30 target columns

    def forward(self, ids, mask, token_type_ids):
        # return_dict=False yields a tuple; o2 is the pooled [CLS] output
        _, o2 = self.bert(ids, attention_mask=mask, token_type_ids=token_type_ids, return_dict=False)
        bo = self.bert_drop(o2)
        return self.out(bo)
class BERTDatasetTraining(torch.utils.data.Dataset):
    def __init__(self, qtitle, qbody, answer, targets, tokenizer, max_len):
        self.qtitle = qtitle
        self.qbody = qbody
        self.answer = answer
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.targets = targets

    def __len__(self):
        return len(self.answer)

    def __getitem__(self, item):
        question_title = str(self.qtitle[item])
        question_body = str(self.qbody[item])
        answer = str(self.answer[item])
        # Pair encoding: (title + body) as the first segment, answer as the second;
        # the tokenizer pads/truncates to max_len, so no manual padding is needed
        inputs = self.tokenizer.encode_plus(
            question_title + " " + question_body,
            answer,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length'
        )
        ids = inputs["input_ids"]
        token_type_ids = inputs["token_type_ids"]
        mask = inputs["attention_mask"]
        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "token_type_ids": torch.tensor(token_type_ids, dtype=torch.long),
            "targets": torch.tensor(self.targets[item, :], dtype=torch.float)
        }
def loss_fn(outputs, targets):
    # targets are 30 continuous values in [0, 1], treated as multi-label
    return nn.BCEWithLogitsLoss()(outputs, targets)
def train_loop_fn(data_loader, model, optimizer, device, scheduler=None):
    model.train()
    for idx_data, item in enumerate(data_loader):
        ids = item['ids'].to(device, dtype=torch.long)
        mask = item['mask'].to(device, dtype=torch.long)
        token_type_ids = item['token_type_ids'].to(device, dtype=torch.long)
        targets = item['targets'].to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(ids, mask, token_type_ids)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        if scheduler is not None:
            scheduler.step()
        if idx_data % 10 == 0:
            print(f'idx = {idx_data}, loss = {loss.item()}')
def eval_loop_fn(data_loader, model, device):
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():  # no gradients needed during evaluation
        for _, item in enumerate(data_loader):
            ids = item['ids'].to(device, dtype=torch.long)
            mask = item['mask'].to(device, dtype=torch.long)
            token_type_ids = item['token_type_ids'].to(device, dtype=torch.long)
            targets = item['targets'].to(device, dtype=torch.float)
            outputs = model(ids, mask, token_type_ids)
            fin_targets.append(targets.cpu().numpy())
            fin_outputs.append(outputs.cpu().numpy())
    return np.vstack(fin_outputs), np.vstack(fin_targets)
def run():
    MAX_LEN = 512
    TRAIN_BATCH_SIZE = 4
    EPOCHS = 20
    FINE_MODEL_PATH = '../output/fine_tuned_model'

    dfx = pd.read_csv("../input/google-quest-challenge/train.csv").fillna("none")
    df_train, df_valid = model_selection.train_test_split(dfx, random_state=42, test_size=0.1)
    df_train = df_train.reset_index(drop=True)
    df_valid = df_valid.reset_index(drop=True)

    # The 30 target columns come from the sample submission header
    sample = pd.read_csv('../input/google-quest-challenge/sample_submission.csv')
    target_cols = list(sample.drop("qa_id", axis=1).columns)
    train_targets = df_train[target_cols].values
    valid_targets = df_valid[target_cols].values

    tokenizer = transformers.BertTokenizer.from_pretrained("../input/bert-base-uncased")

    train_dataset = BERTDatasetTraining(
        qtitle=df_train['question_title'].values,
        qbody=df_train['question_body'].values,
        answer=df_train['answer'].values,
        targets=train_targets,
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )
    train_data_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=TRAIN_BATCH_SIZE,
        shuffle=True
    )

    valid_dataset = BERTDatasetTraining(
        qtitle=df_train['question_title'].values,
        qbody=df_train['question_body'].values,
        answer=df_train['answer'].values,
        targets=valid_targets,
        tokenizer=tokenizer,
        max_len=MAX_LEN
    )
    valid_data_loader = torch.utils.data.DataLoader(
        valid_dataset,
        batch_size=4,
        shuffle=False
    )

    device = "cuda"
    lr = 3e-5
    num_train_steps = int(len(train_dataset) / TRAIN_BATCH_SIZE * EPOCHS)
    model = BERTBasedUncased().to(device)
    optimizer = AdamW(model.parameters(), lr=lr)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=num_train_steps
    )

    for epoch in range(EPOCHS):
        train_loop_fn(data_loader=train_data_loader,
                      model=model,
                      optimizer=optimizer,
                      device=device,
                      scheduler=scheduler)
        o, t = eval_loop_fn(data_loader=valid_data_loader, model=model, device=device)

        # Mean column-wise Spearman correlation (the competition metric)
        spear = []
        for response_idx in range(t.shape[1]):
            p1 = list(t[:, response_idx])
            p2 = list(o[:, response_idx])
            coef, _ = np.nan_to_num(stats.spearmanr(p1, p2))
            spear.append(coef)
        spear = np.mean(spear)
        print(f"epoch = {epoch}, spearman = {spear}")
        torch.save(model.state_dict(), FINE_MODEL_PATH)
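Calling run() always fails at the same index, right at the start of the first validation pass: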
---------------------------------------------------------------------------
IndexError Traceback (most recent call last)
/tmp/ipykernel_40/766504495.py in <module>
----> 1 run()
/tmp/ipykernel_40/2188526498.py in run()
181 scheduler = scheduler)
182
--> 183 o, t = eval_loop_fn(data_loader = valid_data_loader, model = model, device = device)
184
185 spear = []
/tmp/ipykernel_40/2188526498.py in eval_loop_fn(data_loader, model, device)
94 fin_targets = []
95 fin_outputs = []
---> 96 for _, item in enumerate(data_loader):
97 ids = item['ids']
98 mask = item['mask']
/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py in __next__(self)
433 if self._sampler_iter is None:
434 self._reset()
--> 435 data = self._next_data()
436 self._num_yielded += 1
437 if self._dataset_kind == _DatasetKind.Iterable and \
/opt/conda/lib/python3.7/site-packages/torch/utils/data/dataloader.py in _next_data(self)
473 def _next_data(self):
474 index = self._next_index() # may raise StopIteration
--> 475 data = self._dataset_fetcher.fetch(index) # may raise StopIteration
476 if self._pin_memory:
477 data = _utils.pin_memory.pin_memory(data)
/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
/opt/conda/lib/python3.7/site-packages/torch/utils/data/_utils/fetch.py in <listcomp>(.0)
42 def fetch(self, possibly_batched_index):
43 if self.auto_collation:
---> 44 data = [self.dataset[idx] for idx in possibly_batched_index]
45 else:
46 data = self.dataset[possibly_batched_index]
/tmp/ipykernel_40/2188526498.py in __getitem__(self, item)
61 "mask" : torch.tensor(mask, dtype=torch.long),
62 "token_type_ids" : torch.tensor(token_type_ids, dtype=torch.long),
---> 63 "targets" : torch.tensor(self.targets[item, :], dtype=torch.float)
64 }
65
IndexError: index 608 is out of bounds for axis 0 with size 608
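To try to pin it down, I also indexed the validation dataset directly, bypassing the DataLoader (a quick diagnostic sketch, run inside run() right after the loaders are built; names match the code above), and it fails at the same index:

for i in range(len(valid_dataset)):
    try:
        _ = valid_dataset[i]  # triggers __getitem__ exactly like the DataLoader does
    except IndexError:
        print(f"fails at index {i}, len(valid_dataset) = {len(valid_dataset)}, "
              f"targets shape = {valid_dataset.targets.shape}")
        break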