CUDA out of memory even when I have enough memory

I’m trying to fine-tune a Llama 3 model with a large dataset and long sequence lengths. I’m running this on a machine with 2 GPUs (24 GB each), and I’m still getting the error below:

Traceback (most recent call last):
  File "/home/llm04/Ejyle_Sutherland_NLP/finetune/finetune_with_peft.py", line 152, in <module>
    trainer.train()
  File "/home/llm04/new_venv/lib/python3.10/site-packages/trl/trainer/sft_trainer.py", line 440, in train
    output = super().train(*args, **kwargs)
  File "/home/llm04/new_venv/lib/python3.10/site-packages/transformers/trainer.py", line 1885, in train
    return inner_training_loop(
  File "/home/llm04/new_venv/lib/python3.10/site-packages/transformers/trainer.py", line 2216, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/home/llm04/new_venv/lib/python3.10/site-packages/transformers/trainer.py", line 3250, in training_step
    self.accelerator.backward(loss)
  File "/home/llm04/new_venv/lib/python3.10/site-packages/accelerate/accelerator.py", line 2134, in backward
    loss.backward(**kwargs)
  File "/home/llm04/new_venv/lib/python3.10/site-packages/torch/_tensor.py", line 525, in backward
    torch.autograd.backward(
  File "/home/llm04/new_venv/lib/python3.10/site-packages/torch/autograd/__init__.py", line 267, in backward
    _engine_run_backward(
  File "/home/llm04/new_venv/lib/python3.10/site-packages/torch/autograd/graph.py", line 744, in _engine_run_backward
    return Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 500.00 MiB. GPU  has a total capacity of 21.96 GiB of which 277.06 MiB is free. Including non-PyTorch
memory, this process has 21.60 GiB memory in use. Of the allocated memory 20.51 GiB is allocated by PyTorch, and 869.64 MiB is reserved by PyTorch but unallocated. If
reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management
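The error message itself suggests setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True. I assume that would be set like this, before torch initializes the CUDA allocator (or exported in the shell before launching the script), but I’m not sure fragmentation is really the problem here:

import os
# Must be set before the CUDA allocator is initialized for it to take effect
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch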

Here is the code I’m using:

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    get_peft_model,
)
import os
import torch
from torch.utils.data import DataLoader
import wandb
import pandas as pd
from datasets import Dataset, load_dataset
from trl import SFTTrainer, setup_chat_format
from huggingface_hub import login

torch.cuda.empty_cache()

# Insert your Hugging Face token here
hf_token = "token"

# Login to Hugging Face Hub
login(token=hf_token)

# Define paths and model parameters
# base_model = "meta-llama/Meta-Llama-3-8B"
base_model = "/home/llm02/llama3_api/Meta-Llama-3-8B"
dataset_path = "/home/llm02/llama3_api/Ejyle_Sutherland_NLP/finetune/DPO-training/Dataset/final_output_file_10_000.xlsx"
new_model = "first10K"
torch_dtype = torch.bfloat16
attn_implementation = "eager"

# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)

# Load model
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)
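# Note: with device_map="auto", accelerate should shard the 4-bit model across
# both 24 GB GPUs rather than putting a full copy on each one.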

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(base_model)
model, tokenizer = setup_chat_format(model, tokenizer)
model.config.attention_slicing = "auto"

# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=['up_proj', 'down_proj', 'gate_proj', 'k_proj', 'q_proj', 'v_proj', 'o_proj'],
)
model = get_peft_model(model, peft_config)
model.config.attention_slicing = "auto"

#if torch.cuda.device_count() > 1:
#    print("----------------------GPU------------------------")
#    model.is_parallelizable = True
#    model.model_parallel = True

# Load and preprocess the dataset
df = pd.read_excel(dataset_path)
dataset = Dataset.from_pandas(df)
#dataset=load_dataset('csv',data_files=dataset_path)

system_prompt = (
    """you are a medical coding expert, who have access to medical coding guidelines. extract all the medical conditions from all the sections including clinical information/condition and its associated ICD-10 descriptions(description only without code) with anatomical locations for the given clinical text of all the sections, exclude negated conditions from the given radiology record and return only the ICD-10 descriptions(description only without code) of non-negated conditions in this json format{"description":[]}, except json do not send anything else."""
)

template_tokenizer = AutoTokenizer.from_pretrained("HuggingFaceH4/zephyr-7b-beta")

def format_chat_template(row):
    user_content = row["sentence"] if row["sentence"] is not None else ""
    assistant_content = row["condition"] if row["condition"] is not None else ""
    messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": assistant_content}]
    #print(row_json,"------------------------------------------")
    final = template_tokenizer.apply_chat_template(messages, tokenize=False)
    #print(final,"----------")   
    return {'text': final}

#print(format_chat_template(dataset[0]))
print(f"original dataset: {len(dataset)}")
dataset = dataset.map(format_chat_template, remove_columns=dataset.column_names)
print(dataset[0],"---------------------------------map-------------------------------")
print(len(dataset))
dataset = dataset.shuffle(seed=66).select(range(158784))

print(f"shuffled dataset: {len(dataset)}")

# Split the dataset into a training and validation set
dataset = dataset.train_test_split(test_size=0.00539)
print(dataset['train'][0],"----------------split-------------------")
# Training parameters
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=5,
    eval_strategy="epoch",
    eval_steps=10000,  # Change to 10 to avoid too frequent evaluation
    logging_steps=500,
    warmup_steps=1000,
    logging_strategy="epoch",
    learning_rate=2e-5,
    fp16=False,
    bf16=True,
    group_by_length=True,
    load_best_model_at_end=True,
    metric_for_best_model="loss",
    save_total_limit=3,
    save_strategy="epoch",
    #max_steps=(len(dataset)//per_device_train_batch_size)*num_train_epochs
    # report_to="wandb"
)
#device = torch.device("cuda:0")
#peft_model= model.to(device)
# Supervised fine-tuning (SFT) trainer
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    max_seq_length=1024,
    packing=False,
)

trainer.train()


model.config.use_cache = True
trainer.model.save_pretrained(new_model)
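
Even with 4-bit quantization, LoRA, and a per-device batch size of 1, the backward pass still runs out of memory. Is something in this setup holding on to more memory than it should, or do I simply need to cut activation memory further? For reference, this is roughly what I was planning to try next (gradient checkpointing to trade compute for activation memory); I haven’t verified that it plays nicely with the 4-bit + PEFT setup above:

# Sketch of the change I'm considering; `model` and `new_model` refer to the
# script above.
model.config.use_cache = False           # KV cache is not needed during training
model.gradient_checkpointing_enable()    # recompute activations in the backward pass

training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    optim="paged_adamw_32bit",
    num_train_epochs=5,
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    bf16=True,
)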