GPU error: unspecified launch failure

I have 2x 32 GB V100s, and when I am training/fine-tuning an LLM I notice that at some point my GPU memory usage in nvidia-smi was 25 GB and 18 GB, and after some time it shows 32 GB and 20 GB.

Will it keep on increasing, and how can I fix this if I am going to train for longer durations?
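
Right now I am only watching nvidia-smi from outside. Would it make sense to log the allocator stats from inside training with a callback, something like this sketch (assuming GRPOTrainer accepts callbacks= the same way the base Trainer does)?

import torch
from transformers import TrainerCallback

class MemoryLoggerCallback(TrainerCallback):
    # Print PyTorch allocator stats every 50 steps so I can see whether
    # allocated memory really keeps growing or only the cached memory does.
    def on_step_end(self, args, state, control, **kwargs):
        if state.global_step % 50 == 0:
            for i in range(torch.cuda.device_count()):
                alloc = torch.cuda.memory_allocated(i) / 1024**3
                reserved = torch.cuda.memory_reserved(i) / 1024**3
                print(f"step {state.global_step} | GPU {i}: "
                      f"allocated {alloc:.2f} GiB, reserved {reserved:.2f} GiB")

# passed in like: GRPOTrainer(..., callbacks=[MemoryLoggerCallback()])

That way I could at least tell whether it is real allocations growing or just cached blocks that nvidia-smi counts as used.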

I have loaded a 3B model in 8-bit, and I got this error after 72 minutes of training.
The error is this:

RuntimeError Traceback (most recent call last)
Cell In[5], line 38
1 training_args = GRPOConfig(
2 learning_rate=5e-6,
3 adam_beta1=0.9,
(…)
22 output_dir="outputs",
23 )
25 trainer = GRPOTrainer(
26 model = model,
27 processing_class = tokenizer,
(…)
36 train_dataset = dataset,
37 )
—> 38 trainer.train()

File c:\ProgramData\anaconda3\envs\llmrl\Lib\site-packages\transformers\trainer.py:2171, in Trainer.train(self, resume_from_checkpoint, trial, ignore_keys_for_eval, **kwargs)
2169 hf_hub_utils.enable_progress_bars()
2170 else:
→ 2171 return inner_training_loop(
2172 args=args,
2173 resume_from_checkpoint=resume_from_checkpoint,
2174 trial=trial,
2175 ignore_keys_for_eval=ignore_keys_for_eval,
2176 )

File c:\ProgramData\anaconda3\envs\llmrl\Lib\site-packages\transformers\trainer.py:2531, in Trainer._inner_training_loop(self, batch_size, args, resume_from_checkpoint, trial, ignore_keys_for_eval)
2524 context = (
2525 functools.partial(self.accelerator.no_sync, model=model)
2526 if i != len(batch_samples) - 1
2527 and self.accelerator.distributed_type != DistributedType.DEEPSPEED
2528 else contextlib.nullcontext
2529 )
2530 with context():
→ 2531 tr_loss_step = self.training_step(model, inputs, num_items_in_batch)
2533 if (
2534 args.logging_nan_inf_filter
2535 and not is_torch_xla_available()
2536 and (torch.isnan(tr_loss_step) or torch.isinf(tr_loss_step))
2537 ):
2538 # if loss is nan or inf simply add the average of previous logged losses
2539 tr_loss = tr_loss + tr_loss / (1 + self.state.global_step - self._globalstep_last_logged)

File c:\ProgramData\anaconda3\envs\llmrl\Lib\site-packages\transformers\trainer.py:3669, in Trainer.training_step(self, model, inputs, num_items_in_batch)
3666 if hasattr(self.optimizer, "train") and callable(self.optimizer.train):
3667 self.optimizer.train()
→ 3669 inputs = self._prepare_inputs(inputs)
3670 if is_sagemaker_mp_enabled():
3671 loss_mb = smp_forward_backward(model, inputs, self.args.gradient_accumulation_steps)

File c:\ProgramData\anaconda3\envs\llmrl\Lib\site-packages\trl\trainer\grpo_trainer.py:477, in GRPOTrainer._prepare_inputs(self, inputs)
475 with torch.inference_mode():
476 if self.ref_model is not None:
→ 477 ref_per_token_logps = self._get_per_token_logps(
478 self.ref_model, prompt_completion_ids, attention_mask, logits_to_keep
479 )
480 else:
481 with self.accelerator.unwrap_model(self.model).disable_adapter():

File c:\ProgramData\anaconda3\envs\llmrl\Lib\site-packages\trl\trainer\grpo_trainer.py:380, in GRPOTrainer._get_per_token_logps(self, model, input_ids, attention_mask, logits_to_keep)
378 def _get_per_token_logps(self, model, input_ids, attention_mask, logits_to_keep):
379 # We add 1 to logits_to_keep because the last logits of the sequence is later excluded
→ 380 logits = model(
381 input_ids=input_ids, attention_mask=attention_mask, logits_to_keep=logits_to_keep + 1
382 ).logits # (B, L, V)
383 logits = logits[:, :-1, :] # (B, L-1, V), exclude the last logit: it corresponds to the next token pred
385 input_ids = input_ids[:, -logits_to_keep:]

File c:\ProgramData\anaconda3\envs\llmrl\Lib\site-packages\torch\nn\modules\module.py:1739, in Module._wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
→ 1739 return self._call_impl(*args, **kwargs)

File c:\ProgramData\anaconda3\envs\llmrl\Lib\site-packages\torch\nn\modules\module.py:1750, in Module._call_impl(self, *args, **kwargs)
1745 # If we don't have any hooks, we want to skip the rest of the logic in
1746 # this function, and just call forward.
1747 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
→ 1750 return forward_call(*args, **kwargs)
1752 result = None
1753 called_always_called_hooks = set()

File c:\ProgramData\anaconda3\envs\llmrl\Lib\site-packages\accelerate\utils\operations.py:819, in convert_outputs_to_fp32.<locals>.forward(*args, **kwargs)
818 def forward(*args, **kwargs):
→ 819 return model_forward(*args, **kwargs)

File c:\ProgramData\anaconda3\envs\llmrl\Lib\site-packages\accelerate\utils\operations.py:807, in ConvertOutputsToFp32.__call__(self, *args, **kwargs)
806 def __call__(self, *args, **kwargs):
→ 807 return convert_to_fp32(self.model_forward(*args, **kwargs))

File c:\ProgramData\anaconda3\envs\llmrl\Lib\site-packages\accelerate\utils\operations.py:786, in convert_to_fp32(tensor)
780 def _is_fp16_bf16_tensor(tensor):
781 return (is_torch_tensor(tensor) or hasattr(tensor, "dtype")) and tensor.dtype in (
782 torch.float16,
783 torch.bfloat16,
784 )
→ 786 return recursively_apply(_convert_to_fp32, tensor, test_type=_is_fp16_bf16_tensor)

File c:\ProgramData\anaconda3\envs\llmrl\Lib\site-packages\accelerate\utils\operations.py:118, in recursively_apply(func, data, test_type, error_on_other_type, *args, **kwargs)
107 return honor_type(
108 data,
109 (
(…)
114 ),
115 )
116 elif isinstance(data, Mapping):
117 return type(data)(
→ 118 {
119 k: recursively_apply(
120 func, v, *args, test_type=test_type, error_on_other_type=error_on_other_type, **kwargs
121 )
122 for k, v in data.items()
123 }
124 )
125 elif test_type(data):
126 return func(data, *args, **kwargs)

File c:\ProgramData\anaconda3\envs\llmrl\Lib\site-packages\accelerate\utils\operations.py:119, in <dictcomp>(.0)
107 return honor_type(
108 data,
109 (
(…)
114 ),
115 )
116 elif isinstance(data, Mapping):
117 return type(data)(
118 {
→ 119 k: recursively_apply(
120 func, v, *args, test_type=test_type, error_on_other_type=error_on_other_type, **kwargs
121 )
122 for k, v in data.items()
123 }
124 )
125 elif test_type(data):
126 return func(data, *args, **kwargs)

File c:\ProgramData\anaconda3\envs\llmrl\Lib\site-packages\accelerate\utils\operations.py:126, in recursively_apply(func, data, test_type, error_on_other_type, *args, **kwargs)
117 return type(data)(
118 {
119 k: recursively_apply(
(…)
123 }
124 )
125 elif test_type(data):
→ 126 return func(data, *args, **kwargs)
127 elif error_on_other_type:
128 raise TypeError(
129 f"Unsupported types ({type(data)}) passed to {func.__name__}. Only nested list/tuple/dicts of "
130 f"objects that are valid for {test_type.__name__} should be passed."
131 )

File c:\ProgramData\anaconda3\envs\llmrl\Lib\site-packages\accelerate\utils\operations.py:778, in convert_to_fp32.<locals>._convert_to_fp32(tensor)
777 def _convert_to_fp32(tensor):
→ 778 return tensor.float()

RuntimeError: CUDA error: unspecified launch failure
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with TORCH_USE_CUDA_DSA to enable device-side assertions.

My training arguments and settings are:

training_args = GRPOConfig(
learning_rate=5e-6,
adam_beta1=0.9,
adam_beta2=0.99,
weight_decay=0.1,
warmup_ratio=0.1,
lr_scheduler_type="cosine",
optim="adamw_torch",
logging_steps=1,
bf16=torch.cuda.is_bf16_supported(),
fp16=not torch.cuda.is_bf16_supported(),
per_device_train_batch_size=1,
gradient_accumulation_steps=1,
num_generations=10,
max_prompt_length=512,
max_completion_length=250,
num_train_epochs=1,
max_steps=1000,
save_steps=500,
max_grad_norm=0.1,
report_to="none",
output_dir="outputs",
)

trainer = GRPOTrainer(
model = model,
processing_class = tokenizer,
reward_funcs = [
xmlcount_reward_func,
soft_format_reward_func,
strict_format_reward_func,
int_reward_func,
correctness_reward_func,
],
args = training_args,
train_dataset = dataset,
)
trainer.train()
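
And the model itself is loaded in 8-bit, roughly like this (a simplified sketch, not my exact code; the checkpoint name here is just a placeholder):

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

model_name = "my-3b-model"  # placeholder, not the actual checkpoint

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map="auto",          # assumption in this sketch: layers spread across both V100s
    torch_dtype=torch.float16,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)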

Man, what is with these AI-generated suggestions?
First of all, the model is loaded in 8-bit.
Secondly, there are no overheating issues.

And where should I use this command: "CUDA_LAUNCH_BLOCKING=1"?
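
Is it just an environment variable that has to be set before anything initializes CUDA? For example, at the very top of the notebook, before importing torch (this is only my guess, please correct me if it goes somewhere else):

import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"  # my guess: must be set before torch touches the GPU

import torch  # the rest of the imports and the training code follow after this

Or does it need to be set in the terminal before starting Jupyter?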