My task is to do a multi label classification on a custom dataset with pyTorch and BERT. My data contains about 1500 samples. The amount of words can vary between 1000 and 50k words. Because BERT can only handle a max sequence of 512 I am using a sliding window approach on my data. Please note that a data sample can has several sentences.
For reference I´m working with the example notebooks here and from huggingface.
Here is a minimal version of my script:
import transformers
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig
import pandas as pd
import torch
from torch import cuda
import math
import transformers
from transformers import BertTokenizer, BertModel, BertConfig, AutoTokenizer
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
device = 'cuda' if cuda.is_available() else 'cpu'
MAX_LEN = 400
STRIDE = 20
TRAIN_BATCH_SIZE = 8
VALID_BATCH_SIZE = 4
EPOCHS = 1
LEARNING_RATE = 1e-05
model_checkpoint = "bert-base-german-cased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, local_files_only=True)
assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)
class CustomDataset(Dataset):
def __init__(self, dataframe, tokenizer, max_len, stride):
self.tokenizer = tokenizer
self.data = dataframe
self.text = dataframe.text
self.targets = self.data.labels
self.max_len = max_len
self.stride = stride
def __len__(self):
return len(self.text)
def __getitem__(self, index):
text = str(self.text[index])
text = " ".join(text.split())
inputs = self.tokenizer(
text,
None,
max_length=MAX_LEN,
stride=STRIDE,
padding='max_length',
truncation='only_first',
return_overflowing_tokens=True,
)
ids = inputs['input_ids']
mask = inputs['attention_mask']
token_type_ids = inputs["token_type_ids"]
return {
'ids': torch.tensor(ids, dtype=torch.long),
'mask': torch.tensor(mask, dtype=torch.long),
'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
'targets': torch.tensor(self.targets[index], dtype=torch.float)
}
I think the sliding window is working because if I run [len(x) for x in inputs["input_ids"]]
I get a list of input_ids for my paragraph/text.
# Creating the dataset and dataloader for the neural network
train_size = 0.8
train_dataset=training_frame.sample(frac=train_size,random_state=200)
test_dataset=training_frame.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)
print("FULL Dataset: {}".format(training_frame.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))
training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN, STRIDE)
testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN, STRIDE)
train_params = {'batch_size': TRAIN_BATCH_SIZE,
'shuffle': False,
'num_workers': 0
}
test_params = {'batch_size': VALID_BATCH_SIZE,
'shuffle': True,
'num_workers': 0
}
training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)
Until here the script is running without any error, but if I try to iterate over training_set
like here:
train_iter = iter(training_loader)
print(type(train_iter))
text, labels = train_iter.next()
print(text.size())
print(labels.size())
I get the following error:
RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 133 and 75 in dimension 1 at /opt/conda/conda-bld/pytorch_1556653215914/work/aten/src/TH/generic/THTensor.cpp:711
Process finished with exit code 1
I found a similiar problem on stackoverflow and the problem was that the loaded batches have different shapes. He suggests to set the batch_size = 1
. However I want to use the defined the batch_size
defined in my script.
I think the sliding window causes the error because the list of input_ids
can vary per sample in my batchsamples because the total length of the text can be different.
How can I ensure that my data fed to the network has always the same shape?