Hi, I get the same error.
Everything works fine when I use only one GPU, but if I use two, I get the same error. I also reduced the size of my problem so that it runs and finishes quickly on one CPU.
File “/home/server/Escritorio/AlejandroF/agfa/simple-transformers/train_language_model.py”, line 138, in <module>
model.train_model(train_filename)
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/simpletransformers/language_modeling/language_modeling_model.py”, line 431, in train_model
global_step, training_details = self.train(
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/simpletransformers/language_modeling/language_modeling_model.py”, line 774, in train
model(inputs, labels=labels)
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/torch/nn/modules/module.py”, line 727, in _call_impl
result = self.forward(*input, **kwargs)
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py”, line 161, in forward
outputs = self.parallel_apply(replicas, inputs, kwargs)
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/torch/nn/parallel/data_parallel.py”, line 171, in parallel_apply
return parallel_apply(replicas, inputs, kwargs, self.device_ids[:len(replicas)])
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py”, line 86, in parallel_apply
output.reraise()
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/torch/_utils.py”, line 428, in reraise
raise self.exc_type(msg)
RuntimeError: Caught RuntimeError in replica 0 on device 0.
Original Traceback (most recent call last):
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/torch/nn/parallel/parallel_apply.py”, line 61, in _worker
output = module(*input, **kwargs)
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/torch/nn/modules/module.py”, line 727, in _call_impl
result = self.forward(*input, **kwargs)
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py”, line 1329, in forward
outputs = self.bert(
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/torch/nn/modules/module.py”, line 727, in _call_impl
result = self.forward(*input, **kwargs)
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py”, line 991, in forward
encoder_outputs = self.encoder(
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/torch/nn/modules/module.py”, line 727, in _call_impl
result = self.forward(*input, **kwargs)
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py”, line 582, in forward
layer_outputs = layer_module(
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/torch/nn/modules/module.py”, line 727, in _call_impl
result = self.forward(*input, **kwargs)
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py”, line 470, in forward
self_attention_outputs = self.attention(
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/torch/nn/modules/module.py”, line 727, in _call_impl
result = self.forward(*input, **kwargs)
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py”, line 401, in forward
self_outputs = self.self(
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/torch/nn/modules/module.py”, line 727, in _call_impl
result = self.forward(*input, **kwargs)
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/transformers/models/bert/modeling_bert.py”, line 267, in forward
mixed_query_layer = self.query(hidden_states)
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/torch/nn/modules/module.py”, line 727, in _call_impl
result = self.forward(*input, **kwargs)
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/torch/nn/modules/linear.py”, line 93, in forward
return F.linear(input, self.weight, self.bias)
File “/home/server/anaconda3/envs/agfa/lib/python3.9/site-packages/torch/nn/functional.py”, line 1692, in linear
output = input.matmul(weight.t())
RuntimeError: CUDA error: CUBLAS_STATUS_NOT_INITIALIZED when calling cublasCreate(handle)