CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

Hi, I'm having a tough time understanding what causes this error. I'm trying to build a token classification model for aspect-based sentiment analysis (ABSA) with PyTorch Lightning and BERT on Colab, but I get stuck on this error when calling the trainer. Can anybody give me some insight? Thanks in advance!
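
For reference, the relevant part of my LightningModule looks roughly like this (trimmed down from the real code; the class name, checkpoint, and label count below are just placeholders):

```python
import pytorch_lightning as pl
from transformers import BertForTokenClassification

class AbsaTagger(pl.LightningModule):
    def __init__(self, num_labels):
        super().__init__()
        # placeholder checkpoint / label count; the head is a BertForTokenClassification
        self.bert = BertForTokenClassification.from_pretrained(
            "bert-base-cased", num_labels=num_labels
        )

    def forward(self, input_ids, attention_mask, y=None):
        # same call as in the traceback below
        output = self.bert(input_ids, attention_mask, y)
        return output.loss, output.logits

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        return loss
```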

Here is the error traceback, if it can be useful.

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in fit(self, model, train_dataloaders, val_dataloaders, datamodule, train_dataloader, ckpt_path)
736 train_dataloaders = train_dataloader
737 self._call_and_handle_interrupt(
---> 738 self._fit_impl, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path
739 )
740

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _call_and_handle_interrupt(self, trainer_fn, *args, **kwargs)
680 """
681 try:
---> 682 return trainer_fn(*args, **kwargs)
683 # TODO: treat KeyboardInterrupt as BaseException (delete the code below) in v1.7
684 except KeyboardInterrupt as exception:

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _fit_impl(self, model, train_dataloaders, val_dataloaders, datamodule, ckpt_path)
770 # TODO: ckpt_path only in v1.7
771 ckpt_path = ckpt_path or self.resume_from_checkpoint
---> 772 self._run(model, ckpt_path=ckpt_path)
773
774 assert self.state.stopped

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _run(self, model, ckpt_path)
1193
1194 # dispatch start_training or start_evaluating or start_predicting
---> 1195 self._dispatch()
1196
1197 # plugin will finalized fitting (e.g. ddp_spawn will load trained model)

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _dispatch(self)
1272 self.training_type_plugin.start_predicting(self)
1273 else:
---> 1274 self.training_type_plugin.start_training(self)
1275
1276 def run_stage(self):

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in start_training(self, trainer)
200 def start_training(self, trainer: "pl.Trainer") -> None:
201 # double dispatch to initiate the training loop
---> 202 self._results = trainer.run_stage()
203
204 def start_evaluating(self, trainer: "pl.Trainer") -> None:

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in run_stage(self)
1282 if self.predicting:
1283 return self._run_predict()
---> 1284 return self._run_train()
1285
1286 def _pre_training_routine(self):

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _run_train(self)
1304 self.progress_bar_callback.disable()
1305
---> 1306 self._run_sanity_check(self.lightning_module)
1307
1308 # enable train mode

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/trainer/trainer.py in _run_sanity_check(self, ref_model)
1368 # run eval step
1369 with torch.no_grad():
---> 1370 self._evaluation_loop.run()
1371
1372 self.call_hook("on_sanity_check_end")

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/base.py in run(self, *args, **kwargs)
143 try:
144 self.on_advance_start(*args, **kwargs)
---> 145 self.advance(*args, **kwargs)
146 self.on_advance_end()
147 self.restarting = False

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/dataloader/evaluation_loop.py in advance(self, *args, **kwargs)
107 dl_max_batches = self._max_batches[dataloader_idx]
108
---> 109 dl_outputs = self.epoch_loop.run(dataloader, dataloader_idx, dl_max_batches, self.num_dataloaders)
110
111 # store batch level output per dataloader

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/base.py in run(self, *args, **kwargs)
143 try:
144 self.on_advance_start(*args, **kwargs)
---> 145 self.advance(*args, **kwargs)
146 self.on_advance_end()
147 self.restarting = False

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py in advance(self, data_fetcher, dataloader_idx, dl_max_batches, num_dataloaders)
120 # lightning module methods
121 with self.trainer.profiler.profile("evaluation_step_and_end"):
---> 122 output = self._evaluation_step(batch, batch_idx, dataloader_idx)
123 output = self._evaluation_step_end(output)
124

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/loops/epoch/evaluation_epoch_loop.py in _evaluation_step(self, batch, batch_idx, dataloader_idx)
215 self.trainer.lightning_module._current_fx_name = "validation_step"
216 with self.trainer.profiler.profile("validation_step"):
---> 217 output = self.trainer.accelerator.validation_step(step_kwargs)
218
219 return output

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/accelerators/accelerator.py in validation_step(self, step_kwargs)
234 """
235 with self.precision_plugin.val_step_context():
---> 236 return self.training_type_plugin.validation_step(*step_kwargs.values())
237
238 def test_step(self, step_kwargs: Dict[str, Union[Any, int]]) -> Optional[STEP_OUTPUT]:

/usr/local/lib/python3.7/dist-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py in validation_step(self, *args, **kwargs)
217
218 def validation_step(self, *args, **kwargs):
---> 219 return self.model.validation_step(*args, **kwargs)
220
221 def test_step(self, *args, **kwargs):

in validation_step(self, batch, batch_idx)
46 attention_mask = batch["attention_mask"]
47 labels = batch["labels"]
---> 48 loss, outputs = self(input_ids, attention_mask, labels)
49 #self.log("val_loss", loss, prog_bar=True, logger=True)
50 return loss

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
---> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []

in forward(self, input_ids, attention_mask, y)
12 def forward(self, input_ids, attention_mask, y=None):
13 print('input:', input_ids.size(), 'att', attention_mask.size(), 'y', y.size())
---> 14 output = self.bert(input_ids, attention_mask, y)
15 logits = output.logits
16

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1118 input = bw_hook.setup_input_hook(input)
1119
---> 1120 result = forward_call(*input, **kwargs)
1121 if _global_forward_hooks or self._forward_hooks:
1122 for hook in (*_global_forward_hooks.values(), *self._forward_hooks.values()):

/usr/local/lib/python3.7/dist-packages/transformers/models/bert/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, labels, output_attentions, output_hidden_states, return_dict)
1733 output_attentions=output_attentions,
1734 output_hidden_states=output_hidden_states,
---> 1735 return_dict=return_dict,
1736 )
1737

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1118 input = bw_hook.setup_input_hook(input)
1119
---> 1120 result = forward_call(*input, **kwargs)
1121 if _global_forward_hooks or self._forward_hooks:
1122 for hook in (*_global_forward_hooks.values(), *self._forward_hooks.values()):

/usr/local/lib/python3.7/dist-packages/transformers/models/bert/modeling_bert.py in forward(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
1004 output_attentions=output_attentions,
1005 output_hidden_states=output_hidden_states,
---> 1006 return_dict=return_dict,
1007 )
1008 sequence_output = encoder_outputs[0]

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
---> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.7/dist-packages/transformers/models/bert/modeling_bert.py in forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)
588 encoder_attention_mask,
589 past_key_value,
---> 590 output_attentions,
591 )
592

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
---> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.7/dist-packages/transformers/models/bert/modeling_bert.py in forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)
473 head_mask,
474 output_attentions=output_attentions,
---> 475 past_key_value=self_attn_past_key_value,
476 )
477 attention_output = self_attention_outputs[0]

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
---> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.7/dist-packages/transformers/models/bert/modeling_bert.py in forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)
405 encoder_attention_mask,
406 past_key_value,
---> 407 output_attentions,
408 )
409 attention_output = self.output(self_outputs[0], hidden_states)

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
---> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.7/dist-packages/transformers/models/bert/modeling_bert.py in forward(self, hidden_states, attention_mask, head_mask, encoder_hidden_states, encoder_attention_mask, past_key_value, output_attentions)
264 output_attentions=False,
265 ):
---> 266 mixed_query_layer = self.query(hidden_states)
267
268 # If this is instantiated as a cross-attention module, the keys

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
1100 if not (self._backward_hooks or self._forward_hooks or self._forward_pre_hooks or _global_backward_hooks
1101 or _global_forward_hooks or _global_forward_pre_hooks):
---> 1102 return forward_call(*input, **kwargs)
1103 # Do not call functions when jit is used
1104 full_backward_hooks, non_full_backward_hooks = [], []

/usr/local/lib/python3.7/dist-packages/torch/nn/modules/linear.py in forward(self, input)
101
102 def forward(self, input: Tensor) -> Tensor:
---> 103 return F.linear(input, self.weight, self.bias)
104
105 def extra_repr(self) -> str:

/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in linear(input, weight, bias)
1846 if has_torch_function_variadic(input, weight, bias):
1847 return handle_torch_function(linear, (input, weight, bias), input, weight, bias=bias)
---> 1848 return torch._C._nn.linear(input, weight, bias)
1849
1850

RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling `cublasCreate(handle)`

This issue can be caused by running out of GPU memory, in which case cuBLAS isn't able to create its handle.
Reduce the batch size (or otherwise reduce memory usage) and rerun the code.
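
For example, if you build the dataloaders yourself, something like this would be a quick first thing to try (the dataset/model variables and the batch size of 8 are placeholders for your own setup):

```python
import pytorch_lightning as pl
from torch.utils.data import DataLoader

# "train_dataset", "val_dataset", and "model" stand in for your own objects;
# the only real change is the smaller batch_size, which lowers peak GPU memory per step.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

trainer = pl.Trainer(gpus=1, max_epochs=5)
trainer.fit(model, train_dataloaders=train_loader, val_dataloaders=val_loader)
```

You can also run `!nvidia-smi` in a Colab cell before training to check how much GPU memory is already in use.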

Thanks for your reply. Unfortunately, these measures aren’t solving my problem :confused: