Pretty much the same question, but I am loading the model in 8-bit and using PEFT, the batch size is 1, and everything is set as low as possible as far as I understand.
Also, this exact script and these settings worked for a different dataset yesterday.
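For context, the model and processor are set up roughly like this. This is a paraphrased sketch of my loading code rather than a verbatim copy, so the exact quantization/LoRA settings (target modules, rank, device_map) are approximate:

from transformers import Qwen2_5_VLForConditionalGeneration, AutoProcessor, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# 8-bit quantization via bitsandbytes
bnb_config = BitsAndBytesConfig(load_in_8bit=True)

model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    "Qwen/Qwen2.5-VL-3B-Instruct",
    quantization_config=bnb_config,
    device_map="auto",  # nvidia-smi below shows the process using both GPUs
)
model = prepare_model_for_kbit_training(model)

# LoRA adapters on the attention projections
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, lora_config)

processor = AutoProcessor.from_pretrained("Qwen/Qwen2.5-VL-3B-Instruct")

The rest of the script: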
device = "cuda" if torch.cuda.is_available() else "cpu"
model_id = "Qwen/Qwen2.5-VL-3B-Instruct"

EPOCHS = 1
BATCH_SIZE = 1
GRADIENT_CHECKPOINTING = True
USE_REENTRANT = False
LEARNING_RATE = 1e-5
LOGGING_STEPS = 50
EVAL_STEPS = 50
SAVE_STEPS = 50
EVAL_STRATEGY = "steps"
SAVE_STRATEGY = "steps"
METRIC_FOR_BEST_MODEL = "eval_loss"
LOAD_BEST_MODEL_AT_END = True
MAX_GRAD_NORM = 1.0
DATASET_KWARGS = {"skip_prepare_dataset": True}
REMOVE_UNUSED_COLUMNS = False
MAX_SEQ_LEN = 128

NUM_STEPS = (58 // BATCH_SIZE) * EPOCHS  # print(len(train_dataset)) to get the 58
print(f"NUM_STEPS: {NUM_STEPS}")
from transformers import Trainer, TrainingArguments
from transformers.optimization import AdamW
import torch
device = "cuda" if torch.cuda.is_available() else "cpu"
optimizer = AdamW(model.parameters(), lr=LEARNING_RATE)
training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=LEARNING_RATE,
    per_device_train_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=LOGGING_STEPS,
    save_steps=SAVE_STEPS,
    gradient_checkpointing=GRADIENT_CHECKPOINTING,
    max_grad_norm=MAX_GRAD_NORM,
    remove_unused_columns=REMOVE_UNUSED_COLUMNS,
    save_strategy=SAVE_STRATEGY,
    no_cuda=False,
)
def batch_collate_fn(examples):
    batched_inputs = []
    for example in examples:
        texts = [processor.apply_chat_template(example, tokenize=False)]
        image_inputs = [example[1]["content"][0]["image"]]
        inputs = processor(
            text=texts,
            images=image_inputs,
            return_tensors="pt",
            padding=True,
        )
        inputs["labels"] = inputs["input_ids"].clone()
        batched_inputs.append(inputs)

    # Combine the per-example inputs into a single batch
    combined_inputs = {
        key: torch.cat([b[key] for b in batched_inputs])
        for key in batched_inputs[0].keys()
    }
    return combined_inputs
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=eval_dataset,
    optimizers=(optimizer, None),
    data_collator=batch_collate_fn,
)
print("Starting evaluation…")
metric = trainer.evaluate()
print(f"Evaluation metrics: {metric}")
print("-"*30)

print("Starting training…")
trainer.train()
Here is the full traceback:

OutOfMemoryError                          Traceback (most recent call last)
Cell In[8], line 62
     59 print("-"*30)
     61 print("Starting training…")
---> 62 trainer.train()
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\transformers\trainer.py:2184, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
2182 hf_hub_utils.enable_progress_bars()
2183 else:
→ 2184 return inner_training_loop(
2185 args=args,
2186 resume_from_checkpoint=resume_from_checkpoint,
2187 trial=trial,
2188 ignore_keys_for_eval=ignore_keys_for_eval,
2189 )
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\transformers\trainer.py:2490, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2483 context = (
2484 functools.partial(self.accelerator.no_sync, model=model)
2485 if i != len(batch_samples) - 1
2486 and self.accelerator.distributed_type != DistributedType.DEEPSPEED
2487 else contextlib.nullcontext
2488 )
2489 with context():
→ 2490 tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
2492 if (
2493 args.logging_nan_inf_filter
2494 and not is_torch_xla_available()
2495 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
2496 ):
2497 # if loss is nan or inf simply add the average of previous logged losses
2498 tr_loss = tr_loss + tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\transformers\trainer.py:3598, in Trainer.training_step(self, model, inputs, num_items_in_batch)
3595 return loss_mb.reduce_mean().detach().to(self.args.device)
3597 with self.compute_loss_context_manager():
→ 3598 loss = self.compute_loss(model, inputs, num_items_in_batch=num_items_in_batch)
3600 del inputs
3601 if (
3602 self.args.torch_empty_cache_steps is not None
3603 and self.state.global_step % self.args.torch_empty_cache_steps == 0
3604 ):
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\transformers\trainer.py:3659, in Trainer.compute_loss(self, model, inputs, return_outputs, num_items_in_batch)
3657 loss_kwargs[“num_items_in_batch”] = num_items_in_batch
3658 inputs = {**inputs, **loss_kwargs}
→ 3659 outputs = model(**inputs)
3660 # Save past state if it exists
3661 # TODO: this needs to be fixed and made cleaner later.
3662 if self.args.past_index >= 0:
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\torch\nn\modules\module.py:1739, in Module._wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
→ 1739 return self._call_impl(*args, **kwargs)
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\torch\nn\modules\module.py:1750, in Module._call_impl(self, *args, **kwargs)
1745 # If we don’t have any hooks, we want to skip the rest of the logic in
1746 # this function, and just call forward.
1747 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
→ 1750 return forward_call(*args, **kwargs)
1752 result = None
1753 called_always_called_hooks = set()
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\peft\peft_model.py:1719, in PeftModelForCausalLM.forward(self, input_ids, attention_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict, task_ids, **kwargs)
1717 with self._enable_peft_forward_hooks(**kwargs):
1718 kwargs = {k: v for k, v in kwargs.items() if k not in self.special_peft_forward_args}
→ 1719 return self.base_model(
1720 input_ids=input_ids,
1721 attention_mask=attention_mask,
1722 inputs_embeds=inputs_embeds,
1723 labels=labels,
1724 output_attentions=output_attentions,
1725 output_hidden_states=output_hidden_states,
1726 return_dict=return_dict,
1727 **kwargs,
1728 )
1730 batch_size = _get_batch_size(input_ids, inputs_embeds)
1731 if attention_mask is not None:
1732 # concat prompt attention mask
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\torch\nn\modules\module.py:1739, in Module._wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
→ 1739 return self._call_impl(*args, **kwargs)
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\torch\nn\modules\module.py:1750, in Module._call_impl(self, *args, **kwargs)
1745 # If we don’t have any hooks, we want to skip the rest of the logic in
1746 # this function, and just call forward.
1747 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
→ 1750 return forward_call(*args, **kwargs)
1752 result = None
1753 called_always_called_hooks = set()
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\peft\tuners\tuners_utils.py:197, in BaseTuner.forward(self, *args, **kwargs)
196 def forward(self, *args: Any, **kwargs: Any):
→ 197 return self.model.forward(*args, **kwargs)
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\accelerate\hooks.py:170, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
168 output = module._old_forward(*args, **kwargs)
169 else:
→ 170 output = module._old_forward(*args, **kwargs)
171 return module._hf_hook.post_forward(module, output)
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\transformers\models\qwen2_5_vl\modeling_qwen2_5_vl.py:1740, in Qwen2_5_VLForConditionalGeneration.forward(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict, pixel_values, pixel_values_videos, image_grid_thw, video_grid_thw, rope_deltas, cache_position, second_per_grid_ts)
1738 if pixel_values is not None:
1739 pixel_values = pixel_values.type(self.visual.dtype)
→ 1740 image_embeds = self.visual(pixel_values, grid_thw=image_grid_thw)
1741 n_image_tokens = (input_ids == self.config.image_token_id).sum().item()
1742 n_image_features = image_embeds.shape[0]
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\torch\nn\modules\module.py:1739, in Module._wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
→ 1739 return self._call_impl(*args, **kwargs)
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\torch\nn\modules\module.py:1750, in Module._call_impl(self, *args, **kwargs)
1745 # If we don’t have any hooks, we want to skip the rest of the logic in
1746 # this function, and just call forward.
1747 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
→ 1750 return forward_call(*args, **kwargs)
1752 result = None
1753 called_always_called_hooks = set()
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\accelerate\hooks.py:170, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
168 output = module._old_forward(*args, **kwargs)
169 else:
→ 170 output = module._old_forward(*args, **kwargs)
171 return module._hf_hook.post_forward(module, output)
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\transformers\models\qwen2_5_vl\modeling_qwen2_5_vl.py:497, in Qwen2_5_VisionTransformerPretrainedModel.forward(self, hidden_states, grid_thw)
495 cu_seqlens_now = cu_window_seqlens
496 if self.gradient_checkpointing and self.training:
→ 497 hidden_states = self._gradient_checkpointing_func(
498 blk.__call__, hidden_states, cu_seqlens_now, rotary_pos_emb
499 )
500 else:
501 hidden_states = blk(
502 hidden_states,
503 cu_seqlens=cu_seqlens_now,
504 rotary_pos_emb=rotary_pos_emb,
505 )
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\torch\_compile.py:32, in _disable_dynamo.<locals>.inner(*args, **kwargs)
29 disable_fn = torch._dynamo.disable(fn, recursive)
30 fn.__dynamo_disable = disable_fn
---> 32 return disable_fn(*args, **kwargs)
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\torch\_dynamo\eval_frame.py:745, in DisableContext.__call__.<locals>._fn(*args, **kwargs)
741 prior_skip_guard_eval_unsafe = set_skip_guard_eval_unsafe(
742 _is_skip_guard_eval_unsafe_stance()
743 )
744 try:
→ 745 return fn(*args, **kwargs)
746 finally:
747 _maybe_set_eval_frame(prior)
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\torch\utils\checkpoint.py:489, in checkpoint(function, use_reentrant, context_fn, determinism_check, debug, *args, **kwargs)
484 if context_fn is not noop_context_fn or debug is not False:
485     raise ValueError(
486         "Passing `context_fn` or `debug` is only supported when "
487         "use_reentrant=False."
488     )
→ 489 return CheckpointFunction.apply(function, preserve, *args)
490 else:
491     gen = _checkpoint_without_reentrant_generator(
492         function, preserve, context_fn, determinism_check, debug, *args, **kwargs
493     )
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\torch\autograd\function.py:575, in Function.apply(cls, *args, **kwargs)
572 if not torch._C._are_functorch_transforms_active():
573 # See NOTE: [functorch vjp and autograd interaction]
574 args = _functorch.utils.unwrap_dead_wrappers(args)
→ 575 return super().apply(*args, **kwargs) # type: ignore[misc]
577 if not is_setup_ctx_defined:
578 raise RuntimeError(
579 "In order to use an autograd.Function with functorch transforms "
580 "(vmap, grad, jvp, jacrev, …), it must override the setup_context "
581 "staticmethod. For more details, please see "
582 “Extending torch.func with autograd.Function — PyTorch main documentation”
583 )
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\torch\utils\checkpoint.py:264, in CheckpointFunction.forward(ctx, run_function, preserve_rng_state, *args)
261 ctx.save_for_backward(*tensor_inputs)
263 with torch.no_grad():
→ 264 outputs = run_function(*args)
265 return outputs
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\torch\nn\modules\module.py:1739, in Module._wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
→ 1739 return self._call_impl(*args, **kwargs)
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\torch\nn\modules\module.py:1750, in Module._call_impl(self, *args, **kwargs)
1745 # If we don’t have any hooks, we want to skip the rest of the logic in
1746 # this function, and just call forward.
1747 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
→ 1750 return forward_call(*args, **kwargs)
1752 result = None
1753 called_always_called_hooks = set()
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\accelerate\hooks.py:170, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
168 output = module._old_forward(*args, **kwargs)
169 else:
→ 170 output = module._old_forward(*args, **kwargs)
171 return module._hf_hook.post_forward(module, output)
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\transformers\models\qwen2_5_vl\modeling_qwen2_5_vl.py:297, in Qwen2_5_VLVisionBlock.forward(self, hidden_states, cu_seqlens, rotary_pos_emb)
296 def forward(self, hidden_states, cu_seqlens, rotary_pos_emb) → torch.Tensor:
→ 297 hidden_states = hidden_states + self.attn(
298 self.norm1(hidden_states),
299 cu_seqlens=cu_seqlens,
300 rotary_pos_emb=rotary_pos_emb,
301 )
302 hidden_states = hidden_states + self.mlp(self.norm2(hidden_states))
303 return hidden_states
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\torch\nn\modules\module.py:1739, in Module._wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
→ 1739 return self._call_impl(*args, **kwargs)
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\torch\nn\modules\module.py:1750, in Module._call_impl(self, *args, **kwargs)
1745 # If we don’t have any hooks, we want to skip the rest of the logic in
1746 # this function, and just call forward.
1747 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
→ 1750 return forward_call(*args, **kwargs)
1752 result = None
1753 called_always_called_hooks = set()
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\accelerate\hooks.py:170, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
168 output = module._old_forward(*args, **kwargs)
169 else:
→ 170 output = module._old_forward(*args, **kwargs)
171 return module._hf_hook.post_forward(module, output)
File c:\ProgramData\anaconda3\envs\qwen\lib\site-packages\transformers\models\qwen2_5_vl\modeling_qwen2_5_vl.py:272, in Qwen2_5_VLVisionSdpaAttention.forward(self, hidden_states, cu_seqlens, rotary_pos_emb)
270 k = k.transpose(0, 1)
271 v = v.transpose(0, 1)
→ 272 attn_output = F.scaled_dot_product_attention(q, k, v, attention_mask, dropout_p=0.0)
273 attn_output = attn_output.transpose(0, 1)
274 attn_output = attn_output.reshape(seq_length, -1)
OutOfMemoryError: CUDA out of memory. Tried to allocate 65.62 GiB. GPU 0 has a total capacity of 31.86 GiB of which 24.54 GiB is free. Of the allocated memory 6.68 GiB is allocated by PyTorch, and 254.10 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation. See documentation for Memory Management (CUDA semantics — PyTorch 2.6 documentation)
nvidia-smi output after hitting the error:
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 571.96                 Driver Version: 571.96         CUDA Version: 12.8     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|=========================================+========================+======================|
|   0  Tesla V100-PCIE-32GB           TCC |   00000000:18:00.0 Off |                    0 |
| N/A   61C    P0             46W / 250W  |    7497MiB / 32768MiB  |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla V100-PCIE-32GB           TCC |   00000000:86:00.0 Off |                    0 |
| N/A   46C    P0             39W / 250W  |    2305MiB / 32768MiB  |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+

+-----------------------------------------------------------------------------------------+
| Processes:                                                                              |
|  GPU   GI   CI              PID   Type   Process name                        GPU Memory |
|        ID   ID                                                               Usage      |
|=========================================================================================|
|    0   N/A  N/A           23300      C   ...naconda3\envs\qwen\python.exe        7482MiB |
|    1   N/A  N/A           23300      C   ...naconda3\envs\qwen\python.exe        2294MiB |
+-----------------------------------------------------------------------------------------+
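Since the OOM happens inside the vision tower's attention (the 65.62 GiB allocation is in scaled_dot_product_attention of Qwen2_5_VLVisionSdpaAttention), I suspect the number of image patches matters more here than the text length. This is a minimal sanity check I can run on one sample (assuming train_dataset supports integer indexing; it reuses the processor and batch_collate_fn defined above):

# Inspect what the collator actually feeds the model for a single example
batch = batch_collate_fn([train_dataset[0]])
for key, value in batch.items():
    print(key, tuple(value.shape))
# For Qwen2.5-VL, pixel_values has shape (num_patches, patch_dim); a very large
# num_patches (i.e. a high-resolution image) means many vision tokens and a
# correspondingly huge attention matrix in the vision encoder.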