How do I figure out the corresponding arguments in PeftModel.forward?

I am trying to fine-tune the Llama-2 model from :hugs: using PEFT:

import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (AutoModelForSequenceClassification, AutoTokenizer, BitsAndBytesConfig,
                          DataCollatorWithPadding, Trainer, TrainingArguments)

model_id = "meta-llama/Llama-2-7b-chat-hf"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=API)  # API holds my HF access token
model = AutoModelForSequenceClassification.from_pretrained(
    model_id, num_labels=3, quantization_config=bnb_config, device_map="auto", token=API
)

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

config = LoraConfig(
    r=2,
    lora_alpha=2,
    target_modules=[
    "q_proj",
    "up_proj",
    "o_proj",
    "k_proj",
    "down_proj",
    "gate_proj",
    "v_proj"],
    lora_dropout=0.15,
    bias="none"
)

model = get_peft_model(model, config)
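
# Sanity check (just a sketch): confirm which PEFT wrapper class
# get_peft_model returned, since the Trainer later inspects this object's
# forward() signature when deciding which dataset columns to keep.
print(type(model))
model.print_trainable_parameters()
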
data = load_dataset("FinanceInc/auditor_sentiment")

tokenizer.pad_token = tokenizer.eos_token
data = data.map(lambda samples: tokenizer(samples["sentence"], return_tensors='pt', padding=True), batched=True)

data = data.rename_column('label', 'labels')
data

The data:

DatasetDict({
    train: Dataset({
        features: ['sentence', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 3877
    })
    test: Dataset({
        features: ['sentence', 'labels', 'input_ids', 'attention_mask'],
        num_rows: 969
    })
})

Now, the model clearly accepts `input_ids` and `attention_mask`, since the code below produces a perfectly valid output:

input_ids = torch.tensor(data['train'][0]['input_ids']).unsqueeze(0)
attention_mask = torch.tensor(data['train'][0]['attention_mask']).unsqueeze(0)

output = model(input_ids=input_ids, attention_mask=attention_mask)
output

Output:

SequenceClassifierOutputWithPast(loss={'logits': tensor([[-1.7373, 0.5537, 0.7510]], grad_fn=<...>)}, logits=tensor([[-1.7373, 0.5537, 0.7510]], grad_fn=<...>), past_key_values=None, hidden_states=None, attentions=None)

So I try to use the `Trainer` class:

trainer = Trainer(
    model=model,
    train_dataset=data['train'],
    eval_dataset=data['test'],
    args=TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=4,
        warmup_steps=2,
        max_steps=10,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        output_dir="outputs",
        optim="paged_adamw_8bit"
    ),
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),
)
model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()

But I keep getting the following warnings and error:

Found safetensors installation, but --save_safetensors=False. Safetensors should be a preferred weights saving format due to security and performance reasons. If your model cannot be saved by safetensors please feel free to open an issue at https://github.com/huggingface/safetensors!
PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
The model is quantized. To train this model you need to add additional modules inside the model such as adapters using `peft` library and freeze the model weights. Please check the examples in https://github.com/huggingface/peft for more details.
max_steps is given, it will override any value given in num_train_epochs
The following columns in the training set don't have a corresponding argument in `PeftModel.forward` and have been ignored: sentence, labels, attention_mask, input_ids. If sentence, labels, attention_mask, input_ids are not expected by `PeftModel.forward`,  you can safely ignore this message.
***** Running training *****
  Num examples = 0
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 4
  Gradient Accumulation steps = 4
  Total optimization steps = 10
  Number of trainable parameters = 5,009,408
---------------------------------------------------------------------------
IndexError                                Traceback (most recent call last)
<ipython-input-59-d57a6efd24d9> in <cell line: 19>()
     17 )
     18 model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
---> 19 trainer.train()

11 frames
/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
   1537                 hf_hub_utils.enable_progress_bars()
   1538         else:
-> 1539             return inner_training_loop(
   1540                 args=args,
   1541                 resume_from_checkpoint=resume_from_checkpoint,

/usr/local/lib/python3.10/dist-packages/transformers/trainer.py in _inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
   1797 
   1798             step = -1
-> 1799             for step, inputs in enumerate(epoch_iterator):
   1800                 total_batched_samples += 1
   1801                 if rng_to_sync:

/usr/local/lib/python3.10/dist-packages/accelerate/data_loader.py in __iter__(self)
    382         # We iterate one batch ahead to check when we are at the end
    383         try:
--> 384             current_batch = next(dataloader_iter)
    385         except StopIteration:
    386             yield

/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py in __next__(self)
    631                 # TODO(https://github.com/pytorch/pytorch/issues/76750)
    632                 self._reset()  # type: ignore[call-arg]
--> 633             data = self._next_data()
    634             self._num_yielded += 1
    635             if self._dataset_kind == _DatasetKind.Iterable and \

/usr/local/lib/python3.10/dist-packages/torch/utils/data/dataloader.py in _next_data(self)
    675     def _next_data(self):
    676         index = self._next_index()  # may raise StopIteration
--> 677         data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
    678         if self._pin_memory:
    679             data = _utils.pin_memory.pin_memory(data, self._pin_memory_device)

/usr/local/lib/python3.10/dist-packages/torch/utils/data/_utils/fetch.py in fetch(self, possibly_batched_index)
     47         if self.auto_collation:
     48             if hasattr(self.dataset, "__getitems__") and self.dataset.__getitems__:
---> 49                 data = self.dataset.__getitems__(possibly_batched_index)
     50             else:
     51                 data = [self.dataset[idx] for idx in possibly_batched_index]

/usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py in __getitems__(self, keys)
   2805     def __getitems__(self, keys: List) -> List:
   2806         """Can be used to get a batch using a list of integers indices."""
-> 2807         batch = self.__getitem__(keys)
   2808         n_examples = len(batch[next(iter(batch))])
   2809         return [{col: array[i] for col, array in batch.items()} for i in range(n_examples)]

/usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py in __getitem__(self, key)
   2801     def __getitem__(self, key):  # noqa: F811
   2802         """Can be used to index columns (by string names) or rows (by integer index or iterable of indices or bools)."""
-> 2803         return self._getitem(key)
   2804 
   2805     def __getitems__(self, keys: List) -> List:

/usr/local/lib/python3.10/dist-packages/datasets/arrow_dataset.py in _getitem(self, key, **kwargs)
   2785         format_kwargs = format_kwargs if format_kwargs is not None else {}
   2786         formatter = get_formatter(format_type, features=self._info.features, **format_kwargs)
-> 2787         pa_subtable = query_table(self._data, key, indices=self._indices if self._indices is not None else None)
   2788         formatted_output = format_table(
   2789             pa_subtable, key, formatter=formatter, format_columns=format_columns, output_all_columns=output_all_columns

/usr/local/lib/python3.10/dist-packages/datasets/formatting/formatting.py in query_table(table, key, indices)
    581     else:
    582         size = indices.num_rows if indices is not None else table.num_rows
--> 583         _check_valid_index_key(key, size)
    584     # Query the main table
    585     if indices is None:

/usr/local/lib/python3.10/dist-packages/datasets/formatting/formatting.py in _check_valid_index_key(key, size)
    534     elif isinstance(key, Iterable):
    535         if len(key) > 0:
--> 536             _check_valid_index_key(int(max(key)), size=size)
    537             _check_valid_index_key(int(min(key)), size=size)
    538     else:

/usr/local/lib/python3.10/dist-packages/datasets/formatting/formatting.py in _check_valid_index_key(key, size)
    524     if isinstance(key, int):
    525         if (key < 0 and key + size < 0) or (key >= size):
--> 526             raise IndexError(f"Invalid key: {key} is out of bounds for size {size}")
    527         return
    528     elif isinstance(key, slice):

IndexError: Invalid key: 2012 is out of bounds for size 0

The warning above says that sentence, labels, attention_mask and input_ids were all dropped because none of them have a corresponding argument in `PeftModel.forward`, which would explain `Num examples = 0` and the out-of-bounds index. How do I fix this? And how do I find out which arguments the model's forward method actually expects?
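
The only idea I have so far is to inspect the forward signature directly with Python's inspect module. Something like this sketch is what I have in mind (I am not sure whether the Trainer looks at the PEFT wrapper's forward or the underlying model's):

import inspect

# Sketch: list the parameter names of the forward() that the Trainer
# presumably inspects when deciding which dataset columns to keep.
print(list(inspect.signature(model.forward).parameters))

# In case the PEFT wrapper hides the base model's signature, also check the
# underlying transformers model (assuming get_base_model() is the right way
# to reach it).
print(list(inspect.signature(model.get_base_model().forward).parameters))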

I believe this is possibly an issue with the Hugging Face data collator.
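
To test that idea in isolation, I could call the collator directly on a couple of tokenized rows and bypass the Trainer entirely. Something like this sketch (column names taken from the dataset above):

# Sketch: run DataCollatorWithPadding by hand on two tokenized examples.
# If this produces a sensible padded batch, the collator itself is probably
# fine and the real problem is that the Trainer drops the columns beforehand.
collator = DataCollatorWithPadding(tokenizer=tokenizer)
features = [
    {k: data["train"][i][k] for k in ("input_ids", "attention_mask", "labels")}
    for i in range(2)
]
batch = collator(features)
print({k: v.shape for k, v in batch.items()})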