Hi, I switched from TF/Keras to PyTorch/Lightning but ran into the error below. Can anyone help me with this?
Is there a recommended way to debug errors like this in PyTorch?
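The only thing I found so far is to rebuild the DataLoader with num_workers=0, so the real traceback is not hidden behind a worker process, and to pull one sample and one batch by hand, roughly like this (a minimal sketch; data_module, DataLoader and BATCH_SIZE are from the full script below):

data_module.setup()
sample = data_module.val_dataset[0]
print({k: getattr(v, 'shape', len(v)) for k, v in sample.items()})  # per-sample shapes
loader = DataLoader(data_module.val_dataset, batch_size=BATCH_SIZE, num_workers=0)
batch = next(iter(loader))  # raises right here if the samples cannot be stacked

Is that a sensible approach, or is there something better?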
Data example (one token and its label per line, with a blank line between sentences):
Die O
angegebenen O
Werte O
setzen O
voraus O
, O
dass O
sich O
Ihre O
Versicherung O
seit O
dem O
1 B-DATE
. I-DATE
10 I-DATE
. I-DATE
2019 I-DATE
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping
from pytorch_lightning.loggers import TensorBoardLogger
from transformers import BertTokenizer, BertForTokenClassification, AdamW
from sklearn.model_selection import train_test_split
FILE_PATH = 'data/own.txt'
def read_conll_from_txt_to_df(file_path):
    # one token and its label per line; a blank line marks a sentence boundary
    df = pd.DataFrame(columns=['SENTENCE', 'TOKEN', 'LABEL'])
sent = 1
with open(file_path, 'r') as f:
for i, line in enumerate(f):
line = line.replace('\n', '')
splitted = line.split()
if not splitted:
sent += 1
else:
df.loc[i] = [sent, splitted[0], splitted[1]]
return df
data = read_conll_from_txt_to_df(FILE_PATH)
class SentenceGetter(object):
def __init__(self, data):
self.n_sent = 1
self.data = data
self.empty = False
agg_func = lambda s: [(token, tag) for token, tag in zip(s["TOKEN"].values.tolist(),
s["LABEL"].values.tolist())]
self.grouped = self.data.groupby("SENTENCE").apply(agg_func)
self.sentences = [s for s in self.grouped]
    def get_next(self):
        try:
            # self.grouped is indexed by the integer sentence id from the SENTENCE column
            s = self.grouped[self.n_sent]
            self.n_sent += 1
            return s
        except KeyError:
            return None
getter = SentenceGetter(data)
tags_vals = list(set(data["LABEL"].values))
tag2idx = {t: i for i, t in enumerate(tags_vals)}
sentences = [' '.join([s[0] for s in sent]) for sent in getter.sentences]
labels = [[s[1] for s in sent] for sent in getter.sentences]
labels = [[tag2idx.get(l) for l in lab] for lab in labels]
##### only overview #####
tags = ["[PAD]"]
tags.extend(list(set(data["LABEL"].values)))
tag2idx = {t: i for i, t in enumerate(tags)}  # NOTE: this overwrites the tag2idx used to map `labels` above
print('Length of Labels : ' + str(len(tags)))
words = ["[PAD]", "[UNK]"]
words.extend(list(set(data["TOKEN"].values)))
word2idx = {t: i for i, t in enumerate(words)}
print('Length of unique words : ' + str(len(words)))
##### only overview #####
train_sent, test_sent, train_label, test_label = train_test_split(sentences, labels, test_size=0.05)
train_sent, val_sent, train_label, val_label = train_test_split(train_sent, train_label, test_size=0.15)
print('FULL DATASET SENT: ' + str(len(sentences)))
print('FULL DATASET LABELS: ' + str(len(labels)))
print('Train sent size : ' + str(len(train_sent)))
print('Train label size : ' + str(len(train_label)))
print('Test sent size : ' + str(len(test_sent)))
print('Test label size : ' + str(len(test_label)))
print('Val sent size : ' + str(len(val_sent)))
print('Val label size : ' + str(len(val_label)))
class ConllDataset(Dataset):
def __init__(self, tokenizer, sentences, labels, max_len):
self.len = len(sentences)
self.sentences = sentences
self.labels = labels
self.tokenizer = tokenizer
self.max_len = max_len
def __len__(self):
return self.len
def __getitem__(self, index):
sentence = str(self.sentences[index])
inputs = self.tokenizer.encode_plus(
sentence,
None,
add_special_tokens=False,
max_length=self.max_len,
            pad_to_max_length=False,  # deprecated in favor of `padding`
            truncation=True,
            padding=True,  # pads to the longest sequence in this call, a no-op for a single sentence
return_token_type_ids=False,
return_attention_mask=True,
return_tensors='pt'
)
ids = inputs['input_ids']
mask = inputs['attention_mask']
        # copy so the stored label list is not mutated on every access,
        # then pad/truncate it to a fixed length of 200 with tag id 4
        label = list(self.labels[index])
        label.extend([4] * 200)
        label = label[:200]
return {
'ids': ids,
'mask': mask,
'tags': label
}
class NERConllDataset(pl.LightningDataModule):
def __init__(self, tokenizer, train_sent, train_label, val_sent, val_label, test_sent, test_label, max_len, batch_size):
super().__init__()
self.tokenizer = tokenizer
self.train_sent = train_sent
self.train_label = train_label
self.val_sent = val_sent
self.val_label = val_label
self.test_sent = test_sent
self.test_label = test_label
self.max_len = max_len
self.batch_size = batch_size
def setup(self, stage=None):
self.train_dataset = ConllDataset(
self.tokenizer,
self.train_sent,
self.train_label,
self.max_len,
)
self.val_dataset = ConllDataset(
self.tokenizer,
            self.val_sent,
            self.val_label,
self.max_len,
)
self.test_dataset = ConllDataset(
self.tokenizer,
            self.test_sent,
            self.test_label,
self.max_len,
)
def train_dataloader(self):
return DataLoader(
self.train_dataset,
batch_size=self.batch_size,
            shuffle=True,  # shuffle only the training data
num_workers=8
)
def val_dataloader(self):
return DataLoader(
self.val_dataset,
batch_size=self.batch_size,
shuffle=False,
num_workers=8
)
def test_dataloader(self):
return DataLoader(
self.test_dataset,
batch_size=self.batch_size,
shuffle=False,
num_workers=8
)
class NerBertModel(pl.LightningModule):
def __init__(self):
        super().__init__()
self.l1 = BertForTokenClassification.from_pretrained('bert-base-german-cased', num_labels=20, return_dict=True)
#self.l2 = torch.nn.Dropout(0.3)
#self.l3 = torch.nn.Linear(768, 200)
def forward(self, ids, mask, labels):
        # keyword arguments so `labels` is not passed positionally as token_type_ids;
        # with return_dict=True the output exposes .loss and .logits
        output = self.l1(input_ids=ids, attention_mask=mask, labels=labels)
        return output.loss, output.logits
#output_1= self.l1(ids, mask, labels = labels)
#output_2 = self.l2(output_1[0])
#output_3 = self.l3(output_2)
#return output_1
def training_step(self, batch, batch_idx):
ids = batch["ids"]
mask = batch["mask"]
labels = batch["tags"]
loss, outputs = self(
ids=ids,
mask=mask,
labels=labels
)
self.log("train_loss", loss, prog_bar=True, logger=True)
        # Lightning expects just the loss (or a dict) back from training_step
        return loss
def validation_step(self, batch, batch_idx):
ids = batch["ids"]
mask = batch["mask"]
labels = batch["tags"]
loss, outputs = self(
ids=ids,
mask=mask,
labels=labels
)
self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss
def test_step(self, batch, batch_idx):
ids = batch["ids"]
mask = batch["mask"]
labels = batch["tags"]
loss, outputs = self(
ids=ids,
mask=mask,
labels=labels
)
self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss
def configure_optimizers(self):
return AdamW(self.parameters(), lr=0.0001)
MAX_LEN = 128
BATCH_SIZE = 8
tokenizer = BertTokenizer.from_pretrained('bert-base-german-cased', do_lower_case=False)
data_module = NERConllDataset(tokenizer, train_sent, train_label, val_sent, val_label, test_sent, test_label, max_len=MAX_LEN, batch_size=BATCH_SIZE)
data_module.setup()
model = NerBertModel()
checkpoint_callback = ModelCheckpoint(
dirpath="checkpoints",
filename="best-checkpoint",
save_top_k=1,
verbose=True,
monitor="val_loss",
mode="min"
)
logger = TensorBoardLogger(save_dir="Lightning_logs", name="news_summary")
early_stopping_callback = EarlyStopping(
monitor="val_loss",
min_delta=0.01,
patience=2,
verbose=True
)
trainer = pl.Trainer(logger=logger,
callbacks=[checkpoint_callback, early_stopping_callback],
max_epochs=2,
gpus=0,
progress_bar_refresh_rate=1
)
trainer.fit(model, data_module)
Error:
Length of Labels : 20
Length of unique words : 2440
FULL DATASET SENT: 710
FULL DATASET LABELS: 710
Train sent size : 572
Train label size : 572
Test sent size : 36
Test label size : 36
Val sent size : 102
Val label size : 102
Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-german-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
EarlyStopping mode set to min for monitoring val_loss.
GPU available: False, used: False
TPU available: False, using: 0 TPU cores
| Name | Type | Params
----------------------------------------------------
0 | l1 | BertForTokenClassification | 108 M
----------------------------------------------------
108 M Trainable params
0 Non-trainable params
108 M Total params
434.025 Total estimated model params size (MB)
Validation sanity check: 0%| | 0/2 [00:00<?, ?it/s]Traceback (most recent call last):
File "/home/felix/Desktop/machine-learning/NER_torch/torch_model.py", line 276, in <module>
trainer.fit(model, data_module)
File "/home/felix/anaconda3/envs/work/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 499, in fit
self.dispatch()
File "/home/felix/anaconda3/envs/work/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 546, in dispatch
self.accelerator.start_training(self)
File "/home/felix/anaconda3/envs/work/lib/python3.8/site-packages/pytorch_lightning/accelerators/accelerator.py", line 73, in start_training
self.training_type_plugin.start_training(trainer)
File "/home/felix/anaconda3/envs/work/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 114, in start_training
self._results = trainer.run_train()
File "/home/felix/anaconda3/envs/work/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 607, in run_train
self.run_sanity_check(self.lightning_module)
File "/home/felix/anaconda3/envs/work/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 864, in run_sanity_check
_, eval_results = self.run_evaluation(max_batches=self.num_sanity_val_batches)
File "/home/felix/anaconda3/envs/work/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 713, in run_evaluation
for batch_idx, batch in enumerate(dataloader):
File "/home/felix/anaconda3/envs/work/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 435, in __next__
data = self._next_data()
File "/home/felix/anaconda3/envs/work/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1085, in _next_data
return self._process_data(data)
File "/home/felix/anaconda3/envs/work/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1111, in _process_data
data.reraise()
File "/home/felix/anaconda3/envs/work/lib/python3.8/site-packages/torch/_utils.py", line 428, in reraise
raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in DataLoader worker process 0.
Original Traceback (most recent call last):
File "/home/felix/anaconda3/envs/work/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 198, in _worker_loop
data = fetcher.fetch(index)
File "/home/felix/anaconda3/envs/work/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 47, in fetch
return self.collate_fn(data)
File "/home/felix/anaconda3/envs/work/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 73, in default_collate
return {key: default_collate([d[key] for d in batch]) for key in elem}
File "/home/felix/anaconda3/envs/work/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 73, in <dictcomp>
return {key: default_collate([d[key] for d in batch]) for key in elem}
File "/home/felix/anaconda3/envs/work/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 55, in default_collate
return torch.stack(batch, 0, out=out)
RuntimeError: stack expects each tensor to be equal size, but got [1, 11] at entry 0 and [1, 41] at entry 1
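From the traceback it looks like default_collate cannot stack the variable-length input_ids tensors. Would a custom collate_fn that pads each batch to its longest sequence be the idiomatic fix? Something like this (just a sketch; the padding value 0 for ids, mask and tags is an assumption and would have to match the tokenizer's pad token id and my tag2idx):

import torch
from torch.nn.utils.rnn import pad_sequence

def pad_collate(batch):
    # pad every sequence in the batch to the length of the longest one
    ids = pad_sequence([b['ids'].flatten() for b in batch], batch_first=True, padding_value=0)
    mask = pad_sequence([b['mask'].flatten() for b in batch], batch_first=True, padding_value=0)
    tags = pad_sequence([torch.as_tensor(b['tags']) for b in batch], batch_first=True, padding_value=0)
    return {'ids': ids, 'mask': mask, 'tags': tags}

# usage: DataLoader(dataset, batch_size=BATCH_SIZE, collate_fn=pad_collate)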
Thanks
Edit:
After changing this in the Dataset class:
ids = inputs['input_ids'].flatten()
mask = inputs['attention_mask'].flatten()
I now get this:
RuntimeError: stack expects each tensor to be equal size, but got [16] at entry 0 and [116] at entry 1
So I think the problem is that the labels list has a different size?
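If that is the cause, I guess I would have to pad the token ids, the mask, and the labels all to the same fixed length inside __getitem__. A sketch of what I have in mind (PAD_LABEL_ID = 0 is an assumption for whatever id the padding tag should get, and I realize this still would not align the word-level labels with BERT's word pieces, it only makes the tensors stackable):

PAD_LABEL_ID = 0  # assumption: the id of my "[PAD]" tag in tag2idx

def __getitem__(self, index):
    sentence = str(self.sentences[index])
    inputs = self.tokenizer.encode_plus(
        sentence,
        add_special_tokens=False,
        max_length=self.max_len,
        padding='max_length',  # pad every example to exactly max_len ...
        truncation=True,       # ... and truncate the longer ones
        return_token_type_ids=False,
        return_attention_mask=True,
        return_tensors='pt'
    )
    # pad/truncate the labels to the same fixed length as the token ids
    label = list(self.labels[index])[:self.max_len]
    label = label + [PAD_LABEL_ID] * (self.max_len - len(label))
    return {
        'ids': inputs['input_ids'].flatten(),        # shape [max_len]
        'mask': inputs['attention_mask'].flatten(),  # shape [max_len]
        'tags': torch.tensor(label, dtype=torch.long)
    }

Would that be the right direction, or is a custom collate_fn preferred?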