I’m using a single-node Databricks cluster with a g5.2xlarge instance to fine-tune a LLaMA-2 model. The same notebook runs smoothly on Colab, but when I try to run it on Databricks it throws the exact error given below:
RuntimeError Traceback (most recent call last)
File <command-65240085594578>, line 1
----> 1 fine_tuning_trainer.train()
File /databricks/python/lib/python3.10/site-packages/mlflow/utils/autologging_utils/safety.py:451, in safe_patch.<locals>.safe_patch_function(*args, **kwargs)
436 if (
437 active_session_failed
438 or autologging_is_disabled(autologging_integration)
(...)
445 # warning behavior during original function execution, since autologging is being
446 # skipped
447 with set_non_mlflow_warnings_behavior_for_current_thread(
448 disable_warnings=False,
449 reroute_warnings=False,
450 ):
--> 451 return original(*args, **kwargs)
453 # Whether or not the original / underlying function has been called during the
454 # execution of patched code
455 original_has_been_called = False
File /databricks/python_shell/dbruntime/huggingface_patches/transformers.py:54, in _create_patch_function.<locals>.patched_fit_function(self, *args, **kwargs)
52 call_succeeded = False
53 try:
---> 54 model = original_method(self, *args, **kwargs)
55 call_succeeded = True
56 return model
File /databricks/python/lib/python3.10/site-packages/transformers/trainer.py:1539, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
1534 self.model_wrapped = self.model
1536 inner_training_loop = find_executable_batch_size(
1537 self._inner_training_loop, self._train_batch_size, args.auto_find_batch_size
1538 )
-> 1539 return inner_training_loop(
1540 args=args,
1541 resume_from_checkpoint=resume_from_checkpoint,
1542 trial=trial,
1543 ignore_keys_for_eval=ignore_keys_for_eval,
1544 )
File /databricks/python/lib/python3.10/site-packages/transformers/trainer.py:1809, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
1806 self.control = self.callback_handler.on_step_begin(args, self.state, self.control)
1808 with self.accelerator.accumulate(model):
-> 1809 tr_loss_step = self.training_step(model, inputs)
1811 if (
1812 args.logging_nan_inf_filter
1813 and not is_torch_tpu_available()
1814 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
1815 ):
1816 # if loss is nan or inf simply add the average of previous logged losses
1817 tr_loss += tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
File /databricks/python/lib/python3.10/site-packages/transformers/trainer.py:2665, in Trainer.training_step(self, model, inputs)
2663 scaled_loss.backward()
2664 else:
-> 2665 self.accelerator.backward(loss)
2667 return loss.detach() / self.args.gradient_accumulation_steps
File /databricks/python/lib/python3.10/site-packages/accelerate/accelerator.py:1853, in Accelerator.backward(self, loss, **kwargs)
1851 self.scaler.scale(loss).backward(**kwargs)
1852 else:
-> 1853 loss.backward(**kwargs)
File /databricks/python/lib/python3.10/site-packages/torch/_tensor.py:487, in Tensor.backward(self, gradient, retain_graph, create_graph, inputs)
477 if has_torch_function_unary(self):
478 return handle_torch_function(
479 Tensor.backward,
480 (self,),
(...)
485 inputs=inputs,
486 )
--> 487 torch.autograd.backward(
488 self, gradient, retain_graph, create_graph, inputs=inputs
489 )
File /databricks/python/lib/python3.10/site-packages/torch/autograd/__init__.py:200, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
195 retain_graph = create_graph
197 # The reason we repeat same the comment below is that
198 # some Python versions print out the first line of a multi-line function
199 # calls in the traceback and some print out the last line
--> 200 Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
201 tensors, grad_tensors_, retain_graph, create_graph, inputs,
202 allow_unreachable=True, accumulate_grad=True)
File /databricks/python/lib/python3.10/site-packages/torch/autograd/function.py:274, in BackwardCFunction.apply(self, *args)
270 raise RuntimeError("Implementing both 'backward' and 'vjp' for a custom "
271 "Function is not allowed. You should only implement one "
272 "of them.")
273 user_fn = vjp_fn if vjp_fn is not Function.vjp else backward_fn
--> 274 return user_fn(self, *args)
File /databricks/python/lib/python3.10/site-packages/torch/utils/checkpoint.py:157, in CheckpointFunction.backward(ctx, *args)
153 if len(outputs_with_grad) == 0:
154 raise RuntimeError(
155 "none of output has requires_grad=True,"
156 " this checkpoint() is not necessary")
--> 157 torch.autograd.backward(outputs_with_grad, args_with_grad)
158 grads = tuple(inp.grad if isinstance(inp, torch.Tensor) else None
159 for inp in detached_inputs)
161 return (None, None) + grads
File /databricks/python/lib/python3.10/site-packages/torch/autograd/__init__.py:200, in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
195 retain_graph = create_graph
197 # The reason we repeat same the comment below is that
198 # some Python versions print out the first line of a multi-line function
199 # calls in the traceback and some print out the last line
--> 200 Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
201 tensors, grad_tensors_, retain_graph, create_graph, inputs,
202 allow_unreachable=True, accumulate_grad=True)
RuntimeError: Expected to mark a variable ready only once. This error is caused by one of the following reasons: 1) Use of a module parameter outside the `forward` function. Please make sure model parameters are not shared across multiple concurrent forward-backward passes. or try to use _set_static_graph() as a workaround if this module graph does not change during training loop.2) Reused parameters in multiple reentrant backward passes. For example, if you use multiple `checkpoint` functions to wrap the same part of your model, it would result in the same set of parameters been used by different reentrant backward passes multiple times, and hence marking a variable ready multiple times. DDP does not support such use cases in default. You can try to use _set_static_graph() as a workaround if your module graph does not change over iterations.
Parameter at index 191 has been marked as ready twice. This means that multiple autograd engine hooks have fired for this particular parameter during this iteration. You can set the environment variable TORCH_DISTRIBUTED_DEBUG to either INFO or DETAIL to print parameter names for further debugging.
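Following the error message's own suggestion, I plan to re-run with more verbose distributed debugging so PyTorch prints the name of the parameter that is being marked ready twice. A minimal sketch (the environment variable has to be set before torch.distributed / the Trainer is initialized, e.g. in the first notebook cell):

import os

# As suggested by the error: make the reducer print parameter names when a
# parameter is marked ready more than once. Must be set before the Trainer
# (and any torch.distributed setup) is created.
os.environ["TORCH_DISTRIBUTED_DEBUG"] = "DETAIL"  # or "INFO"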
Here is my code for fine-tuning LLaMA-2:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from peft import LoraConfig
from trl import SFTTrainer

# Model and tokenizer names
base_model_name = "NousResearch/Llama-2-7b-chat-hf"
refined_model = "finetuned_llama-7b"

# Tokenizer
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
llama_tokenizer.pad_token = llama_tokenizer.eos_token  # llama_tokenizer.add_special_tokens({'pad_token': '[PAD]'})
llama_tokenizer.padding_side = "right"  # Fix for fp16

# Quantization config (4-bit NF4)
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,  # torch.bfloat16
    bnb_4bit_use_double_quant=False,
)

# Base model, loaded in 4-bit on the single GPU
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0},
)
base_model.config.use_cache = False  # Gradient checkpointing is used by default but not compatible with caching
base_model.config.pretraining_tp = 1
# LoRA config
peft_parameters = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    target_modules=["q_proj", "v_proj", "k_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)

# Training params
train_params = TrainingArguments(
    output_dir="/tmp",
    num_train_epochs=2,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    save_steps=25,
    logging_steps=25,
    learning_rate=3e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="none",
    log_level="debug",
    push_to_hub=False,
)

# Trainer
fine_tuning_trainer = SFTTrainer(
    model=base_model,
    train_dataset=train_data,
    eval_dataset=val_data,
    dataset_text_field="text",
    max_seq_length=None,
    peft_config=peft_parameters,
    tokenizer=llama_tokenizer,
    args=train_params,
)

fine_tuning_trainer.train()
What is going wrong here?
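Since the traceback goes through torch.utils.checkpoint, my working theory is that the reentrant gradient-checkpointing backward (which, as my comment above notes, is on by default here) is firing the gradient hooks for some parameters twice under whatever distributed wrapper is applied on Databricks. As a sketch of what I plan to try next (assuming my transformers version exposes gradient_checkpointing_kwargs; I haven't verified that this fixes anything), I would switch to the non-reentrant checkpointing implementation, or disable checkpointing entirely:

from transformers import TrainingArguments

# Untested workaround sketch: keep the same arguments as train_params above,
# but use the non-reentrant checkpointing implementation (or set
# gradient_checkpointing=False to turn checkpointing off altogether).
train_params = TrainingArguments(
    output_dir="/tmp",
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},  # needs a recent transformers release
)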