Ensure that all samples in a batch have the same shape in pytorch dataloader

adama · March 26, 2021, 9:33am

My task is to do multi-label classification on a custom dataset with PyTorch and BERT. My data contains about 1500 samples. The number of words per sample can vary between 1000 and 50k. Because BERT can only handle a maximum sequence length of 512 tokens, I am using a sliding window approach on my data. Please note that a data sample can contain several sentences.

For reference, I'm working with the example notebooks here and from huggingface.

Here is a minimal version of my script:

    import transformers
    from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
    from transformers import BertTokenizer, BertModel, BertConfig
    import pandas as pd
    import torch
    from torch import cuda
    import math
    import transformers
    from transformers import BertTokenizer, BertModel, BertConfig, AutoTokenizer
    from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
    # Prefer the GPU whenever torch reports one as available.
    if cuda.is_available():
        device = 'cuda'
    else:
        device = 'cpu'

    # Sliding-window settings: window length and the tokenizer's `stride` value.
    MAX_LEN = 400
    STRIDE = 20

    # Batch sizes and optimisation settings.
    TRAIN_BATCH_SIZE = 8
    VALID_BATCH_SIZE = 4
    EPOCHS = 1
    LEARNING_RATE = 1e-5

    # The checkpoint is loaded with local_files_only=True, i.e. from files
    # already present on this machine.
    model_checkpoint = "bert-base-german-cased"
    tokenizer = AutoTokenizer.from_pretrained(
        model_checkpoint,
        local_files_only=True,
    )
    # Sanity check: the rest of the script expects a "fast" tokenizer.
    assert isinstance(tokenizer, transformers.PreTrainedTokenizerFast)

    class CustomDataset(Dataset):
        """Dataset that tokenizes one document per item into sliding windows.

        Args:
            dataframe: Object exposing ``text`` and ``labels`` attributes that
                can be indexed by integer position (e.g. a pandas DataFrame
                with a fresh 0..n-1 index).
            tokenizer: Callable tokenizer; it is invoked with ``max_length``,
                ``stride``, ``padding``, ``truncation`` and
                ``return_overflowing_tokens`` keyword arguments.
            max_len: Window length forwarded to the tokenizer as ``max_length``.
            stride: Value forwarded to the tokenizer as ``stride``.
        """

        def __init__(self, dataframe, tokenizer, max_len, stride):
            self.tokenizer = tokenizer
            self.data = dataframe
            self.text = dataframe.text
            self.targets = self.data.labels
            self.max_len = max_len
            self.stride = stride

        def __len__(self):
            """Return the number of documents."""
            return len(self.text)

        def __getitem__(self, index):
            """Tokenize document ``index`` and return its tensors.

            Returns:
                dict with keys ``'ids'``, ``'mask'``, ``'token_type_ids'``
                (long tensors built from the tokenizer output) and
                ``'targets'`` (float tensor built from the label entry).

            NOTE(review): with ``return_overflowing_tokens=True`` the tokenizer
            is expected to return one row per window, so the first dimension
            of ``'ids'`` / ``'mask'`` / ``'token_type_ids'`` depends on the
            length of the text. Items with different window counts cannot be
            stacked by the DataLoader's default collate function.
            """
            # Collapse every run of whitespace into a single space.
            text = " ".join(str(self.text[index]).split())

            # Use the values given to the constructor rather than the
            # module-level MAX_LEN / STRIDE constants, so that each instance
            # honours its own configuration.
            inputs = self.tokenizer(
                text,
                None,
                max_length=self.max_len,
                stride=self.stride,
                padding='max_length',
                truncation='only_first',
                return_overflowing_tokens=True,
            )

            return {
                'ids': torch.tensor(inputs['input_ids'], dtype=torch.long),
                'mask': torch.tensor(inputs['attention_mask'], dtype=torch.long),
                'token_type_ids': torch.tensor(inputs["token_type_ids"], dtype=torch.long),
                'targets': torch.tensor(self.targets[index], dtype=torch.float)
            }

I think the sliding window is working because if I run [len(x) for x in inputs["input_ids"]] I get a list of input_ids for my paragraph/text.

    # Split the frame 80/20 into train/test and wrap both parts in DataLoaders.
    train_size = 0.8
    train_dataset = training_frame.sample(frac=train_size, random_state=200)
    # The test split must be derived before the train index is reset, because
    # it drops rows by the sampled (original) index labels.
    test_dataset = training_frame.drop(train_dataset.index).reset_index(drop=True)
    train_dataset = train_dataset.reset_index(drop=True)

    print(f"FULL Dataset: {training_frame.shape}")
    print(f"TRAIN Dataset: {train_dataset.shape}")
    print(f"TEST Dataset: {test_dataset.shape}")

    training_set = CustomDataset(train_dataset, tokenizer, MAX_LEN, STRIDE)
    testing_set = CustomDataset(test_dataset, tokenizer, MAX_LEN, STRIDE)

    # Keyword arguments for the two loaders.
    # NOTE(review): training data is not shuffled while test data is; confirm
    # that this is intended.
    train_params = dict(
        batch_size=TRAIN_BATCH_SIZE,
        shuffle=False,
        num_workers=0,
    )
    test_params = dict(
        batch_size=VALID_BATCH_SIZE,
        shuffle=True,
        num_workers=0,
    )

    training_loader = DataLoader(training_set, **train_params)
    testing_loader = DataLoader(testing_set, **test_params)

Up to this point the script runs without any error, but if I try to iterate over training_loader like this:

    # Pull a single batch from the loader to inspect its shapes.
    train_iter = iter(training_loader)
    print(type(train_iter))
    # Use the built-in next(); the iterator's .next() method is a Python 2
    # style alias that newer PyTorch versions no longer provide.
    batch = next(train_iter)
    # Each batch is a dict keyed like the items returned by
    # CustomDataset.__getitem__, so select the tensors by key instead of
    # tuple-unpacking the dict.
    text, labels = batch['ids'], batch['targets']
    print(text.size())
    print(labels.size())

I get the following error:

  RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 133 and 75 in dimension 1 at /opt/conda/conda-bld/pytorch_1556653215914/work/aten/src/TH/generic/THTensor.cpp:711
  
  Process finished with exit code 1

I found a similar problem on Stack Overflow, where the cause was that the loaded batches had different shapes. The answer there suggests setting batch_size = 1. However, I want to use the batch_size defined in my script.

I think the sliding window causes the error: the number of input_ids windows can differ between the samples in a batch, because the total length of each text can be different.
How can I ensure that the data fed to the network always has the same shape?