Hi,
I have a model that is running fine on 3 machines with Titan X GPUs. I've tried to run it on a Tesla P100-SXM2-16GB and get this error:
Traceback (most recent call last):
File "/users/oanuru/anaconda3/envs/nips/lib/python3.6/runpy.py", line 193, in _run_module_as_main
"__main__", mod_spec)
File "/users/oanuru/anaconda3/envs/nips/lib/python3.6/runpy.py", line 85, in _run_code
exec(code, run_globals)
File "/data/dgx1/oanuru/experiments/src/main/python/nips2017/__main__.py", line 210, in <module>
main(arg_tools.parse_args(config.Config, FIXED_TYPES, APP_NAME, APP_DESC))
File "/data/dgx1/oanuru/experiments/src/main/python/nips2017/__main__.py", line 207, in main
exp.run()
File "/data/dgx1/oanuru/experiments/src/main/python/nips2017/experiment/base_experiment.py", line 431, in run
self._train(epoch)
File "/data/dgx1/oanuru/experiments/src/main/python/nips2017/experiment/experiment_2.py", line 402, in _train
decodings = self._model(batch.premise, batch.hypothesis, decode_with_tf=(not self._conf.no_teacher_forcing))
File "/users/oanuru/anaconda3/envs/nips/lib/python3.6/site-packages/torch/nn/modules/module.py", line 206, in __call__
result = self.forward(*input, **kwargs)
File "/data/dgx1/oanuru/experiments/src/main/python/nips2017/experiment/models/model_exp_2.py", line 123, in forward
decoding_1 = self.autoenc(sentence_1, decode_with_tf=decode_with_tf)
File "/users/oanuru/anaconda3/envs/nips/lib/python3.6/site-packages/torch/nn/modules/module.py", line 206, in __call__
result = self.forward(*input, **kwargs)
File "/data/dgx1/oanuru/experiments/src/main/python/nips2017/models/autoencoder.py", line 75, in forward
enc = self.encoder(inputs)
File "/users/oanuru/anaconda3/envs/nips/lib/python3.6/site-packages/torch/nn/modules/module.py", line 206, in __call__
result = self.forward(*input, **kwargs)
File "/data/dgx1/oanuru/experiments/src/main/python/nips2017/models/simple_encoder.py", line 188, in forward
output, (hidden, cell) = self.rnn(current_inputs, (hidden, cell))
File "/users/oanuru/anaconda3/envs/nips/lib/python3.6/site-packages/torch/nn/modules/module.py", line 206, in __call__
result = self.forward(*input, **kwargs)
File "/users/oanuru/anaconda3/envs/nips/lib/python3.6/site-packages/torch/nn/modules/rnn.py", line 91, in forward
output, hidden = func(input, self.all_weights, hx)
File "/users/oanuru/anaconda3/envs/nips/lib/python3.6/site-packages/torch/nn/_functions/rnn.py", line 343, in forward
return func(input, *fargs, **fkwargs)
File "/users/oanuru/anaconda3/envs/nips/lib/python3.6/site-packages/torch/autograd/function.py", line 202, in _do_forward
flat_output = super(NestedIOFunction, self)._do_forward(*flat_input)
File "/users/oanuru/anaconda3/envs/nips/lib/python3.6/site-packages/torch/autograd/function.py", line 224, in forward
result = self.forward_extended(*nested_tensors)
File "/users/oanuru/anaconda3/envs/nips/lib/python3.6/site-packages/torch/nn/_functions/rnn.py", line 285, in forward_extended
cudnn.rnn.forward(self, input, hx, weight, output, hy)
File "/users/oanuru/anaconda3/envs/nips/lib/python3.6/site-packages/torch/backends/cudnn/rnn.py", line 296, in forward
ctypes.c_void_p(fn.reserve.data_ptr()), fn.reserve.size(0)
File "/users/oanuru/anaconda3/envs/nips/lib/python3.6/site-packages/torch/backends/cudnn/__init__.py", line 249, in check_error
raise CuDNNError(status)
torch.backends.cudnn.CuDNNError: 8: b'CUDNN_STATUS_EXECUTION_FAILED'
I printed the CUDA and cuDNN versions from PyTorch and got:
+========================+
| GPU INFO |
+===============+========+
| CUDA version | 8.0.44 |
| cuDNN version | 6021 |
+===============+========+
Any help would be appreciated.
Thanks!
Oana