ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['pixel_values']

I want to fine-tune a VisionEncoderDecoderModel.from_pretrained(model_name).
I use the CustomOCRDataset from LearnOpenCV.
But the default_data_collator fails to stack the inputs because the samples have different shapes, so I decided to use DataCollatorForSeq2Seq and a Resize augmentation.
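For context, each sample comes out of the dataset as a dict of tensors, so a manual collate_fn would have to stack the pixel_values and pad the labels itself. A rough, untested sketch of that idea (my own function name, assuming every sample is a dict with 'pixel_values' of equal shape and 'labels' of varying length):

import torch

def ocr_collate_fn(batch, ignore_id=-100):
    # Stack the images (same shape after the processor's resize) and pad the labels
    # to the longest sequence in the batch, using -100 so the loss ignores the padding.
    pixel_values = torch.stack([item["pixel_values"] for item in batch])
    max_len = max(item["labels"].size(0) for item in batch)
    labels = torch.full((len(batch), max_len), ignore_id, dtype=torch.long)
    for i, item in enumerate(batch):
        labels[i, : item["labels"].size(0)] = item["labels"]
    return {"pixel_values": pixel_values, "labels": labels}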

With DataCollatorForSeq2Seq I get this error:
ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['pixel_values']

So I changed __getitem__ to return input_ids, but the error is still the same.

def __getitem__(self, idx):
    file_name = self.df['file_name'][idx]
    text = self.df['text'][idx]

    assert text.strip() != "", f"ERROR Empty text in {idx}"

    # Read the image, apply augmentations, and get the transformed pixels.
    image = Image.open(self.root_dir + file_name).convert('RGB')
    image = train_transforms(image)
    pixel_values = self.processor(image, return_tensors='pt').pixel_values
    # Pass the text through the tokenizer to get the tokenized labels.
    labels = self.processor.tokenizer(
        text,
        padding='max_length',
        max_length=self.max_target_length,
        return_tensors='pt'
    ).input_ids.squeeze(0)

    # Replace pad token ids with -100 so the loss ignores them.
    labels = torch.where(labels == self.processor.tokenizer.pad_token_id, torch.tensor(-100), labels)
    encoding = {"pixel_values": pixel_values.squeeze(0),
                "input_ids": labels}
    return encoding

@dataclass(frozen=True)
class TrainingConfig:
    BATCH_SIZE:    int = 16
    EPOCHS:        int = 5
    LEARNING_RATE: float = 0.00005

@dataclass(frozen=True)
class DatasetConfig:
    DATA_ROOT:     str = image_dir

@dataclass(frozen=True)
class ModelConfig:
    MODEL_NAME: str = 'microsoft/trocr-base-handwritten'

# Augmentations.
train_transforms = transforms.Compose([
    transforms.Resize((1024, 880))
])
processor = TrOCRProcessor.from_pretrained(ModelConfig.MODEL_NAME)
train_dataset = CustomOCRDataset(
    root_dir=os.path.join(DatasetConfig.DATA_ROOT, train_destination),
    df=train_df,
    processor=processor
)
valid_dataset = CustomOCRDataset(
    root_dir=os.path.join(DatasetConfig.DATA_ROOT, test_destination),
    df=test_df,
    processor=processor
)
training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy='epoch',
    per_device_train_batch_size=TrainingConfig.BATCH_SIZE,
    per_device_eval_batch_size=TrainingConfig.BATCH_SIZE,
    fp16=True,
    output_dir='seq2seq_model_printed/',
    logging_strategy='epoch',
    save_strategy='epoch',
    save_total_limit=5,
    report_to='tensorboard',
    num_train_epochs=TrainingConfig.EPOCHS
)
data_collator = DataCollatorForSeq2Seq(tokenizer=processor.tokenizer, model=model, padding=True)
# Initialize trainer.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=processor.feature_extractor,
    args=training_args,
    compute_metrics=compute_cer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=data_collator
)
trainer.train()

The full error:

File \transformers\data\data_collator.py:599, in DataCollatorForSeq2Seq.__call__(self, features, return_tensors)
    596 non_labels_features = [{k: v for k, v in feature.items() if k != label_name} for feature in features]
    598 # run through tokenizer without labels to ensure no side effects
--> 599 batch = pad_without_fast_tokenizer_warning(
    600     self.tokenizer,
    601     non_labels_features,
    602     padding=self.padding,
    603     max_length=self.max_length,
    604     pad_to_multiple_of=self.pad_to_multiple_of,
    605     return_tensors=return_tensors,
    606 )
    608 # we have to pad the labels manually as we cannot rely on `tokenizer.pad` and we need them to be of the same length to return tensors
    609 no_padding = self.padding is False or self.padding == PaddingStrategy.DO_NOT_PAD

File \transformers\data\data_collator.py:66, in pad_without_fast_tokenizer_warning(tokenizer, *pad_args, **pad_kwargs)
     63 tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
     65 try:
---> 66     padded = tokenizer.pad(*pad_args, **pad_kwargs)
     67 finally:
     68     # Restore the state of the warning.
     69     tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = warning_state

File \transformers\tokenization_utils_base.py:3305, in PreTrainedTokenizerBase.pad(self, encoded_inputs, padding, max_length, pad_to_multiple_of, padding_side, return_attention_mask, return_tensors, verbose)
   3303 # The model's main input name, usually `input_ids`, has been passed for padding
   3304 if self.model_input_names[0] not in encoded_inputs:
-> 3305     raise ValueError(
   3306         "You should supply an encoding or a list of encodings to this method "
   3307         f"that includes {self.model_input_names[0]}, but you provided {list(encoded_inputs.keys())}"
   3308     )
   3310 required_input = encoded_inputs[self.model_input_names[0]]
   3312 if required_input is None or (isinstance(required_input, Sized) and len(required_input) == 0):

ValueError: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['pixel_values']

The solutions I've already tried:
→ applying transforms.Resize((1024, 880)),
→ using a custom data collator
With the custom collator I got a TypeError: ViTModel.forward() got an unexpected keyword argument 'num_items_in_batch'. That's why I decided to use DataCollatorForSeq2Seq.

But I'm getting another error over and over: You should supply an encoding or a list of encodings to this method that includes input_ids, but you provided ['pixel_values'].
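Judging from the traceback above, the failure happens inside tokenizer.pad(), which DataCollatorForSeq2Seq calls on the non-label features. A minimal, untested sketch of my own that should reproduce the same ValueError:

import torch
from transformers import TrOCRProcessor

processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
# A feature dict with only image inputs has no input_ids (the tokenizer's main
# input name), so pad() raises the same ValueError.
features = [{"pixel_values": torch.zeros(3, 384, 384)}]
processor.tokenizer.pad(features, padding=True)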
Thanks in advance!

Dear Guru, @ptrblck, does this error make sense to you?

I’m not familiar enough with transformers, but the error is raised from here.
Based on the error message it seems self.model_input_names[0] is compared against the 'pixel_values' tag, so you might need to change the model_input_names (if that’s possible).
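Something like this (an untested sketch, assuming the standard TrOCR checkpoint) would show which input name each part of the processor expects:

from transformers import TrOCRProcessor

processor = TrOCRProcessor.from_pretrained('microsoft/trocr-base-handwritten')
print(processor.tokenizer.model_input_names)        # typically ['input_ids', 'attention_mask']
print(processor.image_processor.model_input_names)  # typically ['pixel_values']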

@ptrblck, I decided to use default_data_collator instead of DataCollatorForSeq2Seq, because the Bullinger code uses it and should work well. But I get this error:

class CustomOCRDataset(Dataset):
    def __init__(self, root_dir, df, processor, max_target_length=512):
        self.root_dir = root_dir
        self.df = df
        self.processor = processor
        self.max_target_length = max_target_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        file_name = self.df['file_name'][idx]
        text = self.df['text'][idx]

        assert text.strip() != "", f"ERROR Empty text in {idx}"

        # Read the image, apply augmentations, and get the transformed pixels.
        image = Image.open(self.root_dir + file_name).convert('RGB')
        image = train_transforms(image)
        pixel_values = self.processor(image, return_tensors='pt').pixel_values
        # Pass the text through the tokenizer and get the labels,
        # i.e. tokenized labels.
        labels = self.processor.tokenizer(
            text,
            padding='max_length',
            max_length=self.max_target_length
        ).input_ids

        # Replace pad token ids with -100 so the loss ignores them.
        labels = [label if label != self.processor.tokenizer.pad_token_id else -100 for label in labels]

        encoding = {"pixel_values": pixel_values.squeeze(), "labels": torch.tensor(labels)}
        return encoding

model = VisionEncoderDecoderModel.from_pretrained("microsoft/trocr-base-handwritten").to(device)
processor = TrOCRProcessor.from_pretrained("microsoft/trocr-base-handwritten")
# Set special tokens used for creating the decoder_input_ids from the labels.
model.config.decoder_start_token_id = processor.tokenizer.cls_token_id
model.config.pad_token_id = processor.tokenizer.pad_token_id
# Set Correct vocab size.
model.config.vocab_size = model.config.decoder.vocab_size

model.config.eos_token_id = processor.tokenizer.sep_token_id
model.config.max_length = 64
model.config.early_stopping = True
model.config.no_repeat_ngram_size = 3
model.config.length_penalty = 2.0
model.config.num_beams = 4

cer_metric = evaluate.load('cer')

def compute_cer(pred):
    labels_ids = pred.label_ids
    pred_ids = pred.predictions

    pred_str = processor.batch_decode(pred_ids, skip_special_tokens=True)
    labels_ids[labels_ids == -100] = processor.tokenizer.pad_token_id
    label_str = processor.batch_decode(labels_ids, skip_special_tokens=True)

    cer = cer_metric.compute(predictions=pred_str, references=label_str)

    return {"cer": cer}

training_args = Seq2SeqTrainingArguments(
    predict_with_generate=True,
    evaluation_strategy="steps",
    per_device_train_batch_size=TrainingConfig.BATCH_SIZE,
    per_device_eval_batch_size=TrainingConfig.BATCH_SIZE,
    fp16=True,
    output_dir='seq2seq_model_printed/',
    logging_steps=2,
    save_steps=int(len(train_dataset)/8),
    eval_steps=int(len(valid_dataset)/8),
    optim='adafactor',
    warmup_steps=int((len(valid_dataset)/8)),
    num_train_epochs=TrainingConfig.EPOCHS
)

# Initialize trainer.
trainer = Seq2SeqTrainer(
    model=model,
    tokenizer=processor.image_processor,
    args=training_args,
    compute_metrics=compute_cer,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    data_collator=default_data_collator
)
res = trainer.train()

Output:
TypeError: ViTModel.forward() got an unexpected keyword argument 'num_items_in_batch'

File \Python310\lib\site-packages\transformers\models\vision_encoder_decoder\modeling_vision_encoder_decoder.py:592, in VisionEncoderDecoderModel.forward(self, pixel_values, decoder_input_ids, decoder_attention_mask, encoder_outputs, past_key_values, decoder_inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, **kwargs)
    589     if pixel_values is None:
    590         raise ValueError("You have to specify pixel_values")
--> 592     encoder_outputs = self.encoder(
    593         pixel_values=pixel_values,
    594         output_attentions=output_attentions,
    595         output_hidden_states=output_hidden_states,
    596         return_dict=return_dict,
    597         **kwargs_encoder,
    598     )
    599 elif isinstance(encoder_outputs, tuple):
    600     encoder_outputs = BaseModelOutput(*encoder_outputs)

File \Python310\lib\site-packages\torch\nn\modules\module.py:1501, in Module._call_impl(self, *args, **kwargs)
   1496 # If we don't have any hooks, we want to skip the rest of the logic in
   1497 # this function, and just call forward.
   1498 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
   1499         or _global_backward_pre_hooks or _global_backward_hooks
   1500         or _global_forward_hooks or _global_forward_pre_hooks):
-> 1501     return forward_call(*args, **kwargs)
   1502 # Do not call functions when jit is used
   1503 full_backward_hooks, non_full_backward_hooks = [], []

I downgraded to transformers==4.45.2 as people recommended, but the issue still persists.
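A possible workaround sketch (untested, and the subclass below is my own, not part of transformers): the traceback suggests the Trainer passes num_items_in_batch into model.forward(), and VisionEncoderDecoderModel forwards unknown kwargs to its encoder, which ViTModel.forward() rejects. Dropping the kwarg before it reaches the encoder is one way around that:

from transformers import VisionEncoderDecoderModel

class PatchedVisionEncoderDecoderModel(VisionEncoderDecoderModel):
    def forward(self, *args, **kwargs):
        # Hypothetical workaround: discard the kwarg the ViT encoder does not accept.
        kwargs.pop("num_items_in_batch", None)
        return super().forward(*args, **kwargs)

model = PatchedVisionEncoderDecoderModel.from_pretrained('microsoft/trocr-base-handwritten')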