CUBLAS_STATUS_NOT_SUPPORTED when calling `cublasGemmStridedBatchedExFix`

Hi,
I'm trying to fine-tune a model with PEFT on my GPU (an MX110), but when the train function is called I get the runtime error below.
CUDA 11.8 is installed with the latest torch Python package. On the CPU, training works as intended.

The full error message:
RuntimeError: CUDA error: CUBLAS_STATUS_NOT_SUPPORTED when calling cublasGemmStridedBatchedExFix(handle, opa, opb, (int)m, (int)n, (int)k, (void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea, b, CUDA_R_16BF, (int)ldb, strideb, (void*)&fbeta, c, CUDA_R_16BF, (int)ldc, stridec, (int)num_batches, CUDA_R_32F, CUBLAS_GEMM_DEFAULT_TENSOR_OP)

I've already looked for solutions online but couldn't find anything useful.
Any advice would be appreciated.
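
For reference, here is what the card reports (a quick check; I'm assuming the bf16 route matters, since the failing call uses CUDA_R_16BF):

import torch

# Capability check: bf16 GEMMs generally need an Ampere-class GPU
# (compute capability >= 8.0); the MX110 is a Maxwell card (5.0).
print(torch.cuda.get_device_name(0))
print(torch.cuda.get_device_capability(0))
print(torch.cuda.is_bf16_supported())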

Could you post a minimal and executable code snippet reproducing the issue? Are other native kernels working fine in this setup?
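
Since the failing call is a bf16 strided-batched GEMM, a tiny matmul like this (a minimal sketch) should already tell you whether the bf16 path works at all on the MX110:

import torch

# A bf16 batched matmul dispatches to cublasGemmStridedBatchedEx
# with CUDA_R_16BF, the same call shown in the traceback.
a = torch.randn(4, 64, 64, dtype=torch.bfloat16, device="cuda")
b = torch.randn(4, 64, 64, dtype=torch.bfloat16, device="cuda")
print(torch.bmm(a, b).shape)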

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer
import torch
from datasets import load_dataset

dataset = load_dataset("knkarthick/dialogsum")
model_name = 'google/flan-t5-base'
# load in bfloat16; the failing GEMM in the traceback runs in this dtype
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, ignore_mismatched_sizes=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(example):
    # wrap each dialogue in an instruction prompt and tokenize to fixed length
    start_prompt = 'Summarize the following conversation. \n\n'
    end_prompt = '\n\nSummary: '
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["dialogue"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["summary"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    return example

tokenized_datasets = dataset.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['id', 'topic', 'dialogue', 'summary'])

from peft import LoraConfig, get_peft_model, TaskType

lora_config = LoraConfig(
    r=32,  # rank
    lora_alpha=32,
    target_modules=["q", "v"],
    lora_dropout=0.05,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM  # FLAN-T5
)

peft_model = get_peft_model(original_model, lora_config)
print(torch.cuda.is_available())
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
peft_model.to(device)

peft_training_args = TrainingArguments(
    output_dir="./peft",
    auto_find_batch_size=True,
    learning_rate=1e-4,
    num_train_epochs=1,
    logging_steps=1,
    max_steps=1
)


peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset=tokenized_datasets["train"],
)

peft_trainer.train()
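
If the bf16 path turns out to be the problem, one workaround to try (a sketch; fp32 needs more memory, but pre-Ampere cards like the MX110 handle it with plain GEMMs) is loading the model in the default float32 and keeping the rest of the script unchanged:

# Same load without torch_dtype=torch.bfloat16, so the forward and
# backward GEMMs run in fp32 instead of the unsupported bf16.
original_model = AutoModelForSeq2SeqLM.from_pretrained(model_name, ignore_mismatched_sizes=True)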