Hello,
I also ran into the same problem recently and I'm not sure how to tackle it. I have already set CUDA_LAUNCH_BLOCKING=1, but I still receive the same error. It seems like the cause might be that the number of labels and the number of output units are not equal, but I'm not sure how to verify this (I've pasted the check I was planning to try after the traceback).
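For reference, this is how I set the flag. My understanding (which may be wrong) is that it has to be set before torch initializes CUDA, so I put it at the very top of the notebook:

```python
import os

# Must be set before importing torch/transformers; once CUDA has been
# initialized, changing this flag has no effect on the running process.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

import torch
```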
Could you please take a look at the error below? I would really appreciate any suggestions. I'm quite new to PyTorch and deep learning, so please bear with me.
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<timed eval> in <module>
~/.local/lib/python3.6/site-packages/transformers/trainer.py in train(self, model_path, trial)
761 continue
762
--> 763 tr_loss += self.training_step(model, inputs)
764 self.total_flos += self.floating_point_ops(inputs)
765
~/.local/lib/python3.6/site-packages/transformers/trainer.py in training_step(self, model, inputs)
1111 loss = self.compute_loss(model, inputs)
1112 else:
-> 1113 loss = self.compute_loss(model, inputs)
1114
1115 if self.args.n_gpu > 1:
~/.local/lib/python3.6/site-packages/transformers/trainer.py in compute_loss(self, model, inputs)
1135 Subclass and override for custom behavior.
1136 """
-> 1137 outputs = model(**inputs)
1138 # Save past state if it exists
1139 if self.args.past_index >= 0:
~/.local/lib/python3.6/site-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
725 result = self._slow_forward(*input, **kwargs)
726 else:
--> 727 result = self.forward(*input, **kwargs)
728 for hook in itertools.chain(
729 _global_forward_hooks.values(),
~/.local/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in forward(self, *inputs, **kwargs)
159 return self.module(*inputs[0], **kwargs[0])
160 replicas = self.replicate(self.module, self.device_ids[:len(inputs)])
--> 161 outputs = self.parallel_apply(replicas, inputs, kwargs)
162 return self.gather(outputs, self.output_device)
163
~/.local/lib/python3.6/site-packages/torch/nn/parallel/data_parallel.py in parallel_apply(self, replicas, inputs, kwargs)
169
170 def parallel_apply(self, replicas, inputs, kwargs):
--> 171 return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
172
173 def gather(self, outputs, output_device):
~/.local/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py in parallel_apply(modules, inputs, kwargs_tup, devices)
84 output = results[i]
85 if isinstance(output, ExceptionWrapper):
---> 86 output.reraise()
87 outputs.append(output)
88 return outputs
~/.local/lib/python3.6/site-packages/torch/_utils.py in reraise(self)
426 # have message field
427 raise self.exc_type(message=msg)
--> 428 raise self.exc_type(msg)
429
430
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File "/home/tlqn/.local/lib/python3.6/site-packages/torch/nn/parallel/parallel_apply.py", line 61, in _worker
output = module(*input, **kwargs)
File "/home/tlqn/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/tlqn/.local/lib/python3.6/site-packages/transformers/modeling_albert.py", line 796, in forward
return_dict=return_dict,
File "/home/tlqn/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/tlqn/.local/lib/python3.6/site-packages/transformers/modeling_albert.py", line 690, in forward
return_dict=return_dict,
File "/home/tlqn/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/tlqn/.local/lib/python3.6/site-packages/transformers/modeling_albert.py", line 421, in forward
hidden_states = self.embedding_hidden_mapping_in(hidden_states)
File "/home/tlqn/.local/lib/python3.6/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/tlqn/.local/lib/python3.6/site-packages/torch/nn/modules/linear.py", line 93, in forward
return F.linear(input, self.weight, self.bias)
File "/home/tlqn/.local/lib/python3.6/site-packages/torch/nn/functional.py", line 1692, in linear
output = input.matmul(weight.t())
RuntimeError: CUDA error: CUBLAS_STATUS_ALLOC_FAILED when calling `cublasCreate(handle)`
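In case it helps, this is roughly how I was planning to verify the label/output-unit mismatch. `model` and `train_dataset` below stand in for the objects I pass to the Trainer, and I'm assuming each example carries a "labels" field:

```python
# Collect every label id that the dataset can feed to the model.
label_ids = {int(example["labels"]) for example in train_dataset}

print("labels seen in data:", sorted(label_ids))
print("num_labels in model:", model.config.num_labels)

# Class ids must lie in [0, num_labels - 1]; out-of-range labels are a
# common cause of opaque CUDA errors during training.
assert min(label_ids) >= 0
assert max(label_ids) < model.config.num_labels
```

Does this look like the right way to check it?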