Hi,
I am running into an error that I have not seen before. It occurs during the `backward()` call:
```
Traceback (most recent call last):
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/trainer/trainer.py", line 1045, in _run_train
    self.fit_loop.run()
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/loops/base.py", line 111, in run
    self.advance(*args, **kwargs)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/loops/fit_loop.py", line 200, in advance
    epoch_output = self.epoch_loop.run(train_dataloader)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/loops/base.py", line 111, in run
    self.advance(*args, **kwargs)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 131, in advance
    batch_output = self.batch_loop.run(batch, self.iteration_count, self._dataloader_idx)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 100, in run
    super().run(batch, batch_idx, dataloader_idx)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/loops/base.py", line 111, in run
    self.advance(*args, **kwargs)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 147, in advance
    result = self._run_optimization(batch_idx, split_batch, opt_idx, optimizer)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 201, in _run_optimization
    self._optimizer_step(optimizer, opt_idx, batch_idx, closure)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 402, in _optimizer_step
    using_lbfgs=is_lbfgs,
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/core/lightning.py", line 1593, in optimizer_step
    optimizer.step(closure=optimizer_closure)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/core/optimizer.py", line 209, in step
    self.__optimizer_step(*args, closure=closure, profiler_name=profiler_name, **kwargs)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/core/optimizer.py", line 129, in __optimizer_step
    trainer.accelerator.optimizer_step(optimizer, self._optimizer_idx, lambda_closure=closure, **kwargs)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py", line 296, in optimizer_step
    self.run_optimizer_step(optimizer, opt_idx, lambda_closure, **kwargs)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py", line 303, in run_optimizer_step
    self.training_type_plugin.optimizer_step(optimizer, lambda_closure=lambda_closure, **kwargs)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 226, in optimizer_step
    optimizer.step(closure=lambda_closure, **kwargs)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/torch/optim/lr_scheduler.py", line 65, in wrapper
    return wrapped(*args, **kwargs)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/torch/optim/optimizer.py", line 88, in wrapper
    return func(*args, **kwargs)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/torch/autograd/grad_mode.py", line 28, in decorate_context
    return func(*args, **kwargs)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/torch/optim/adam.py", line 66, in step
    loss = closure()
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 235, in _training_step_and_backward_closure
    result = self.training_step_and_backward(split_batch, batch_idx, opt_idx, optimizer, hiddens)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 545, in training_step_and_backward
    self.backward(result, optimizer, opt_idx)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/loops/batch/training_batch_loop.py", line 586, in backward
    result.closure_loss = self.trainer.accelerator.backward(result.closure_loss, optimizer, *args, **kwargs)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/accelerators/accelerator.py", line 276, in backward
    self.precision_plugin.backward(self.lightning_module, closure_loss, *args, **kwargs)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/plugins/precision/precision_plugin.py", line 78, in backward
    model.backward(closure_loss, optimizer, *args, **kwargs)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/pytorch_lightning/core/lightning.py", line 1465, in backward
    loss.backward(*args, **kwargs)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/torch/_tensor.py", line 255, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/home/injaureg/Data/Desktop/py36_jttl/lib64/python3.6/site-packages/torch/autograd/__init__.py", line 149, in backward
    allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
RuntimeError: Function ViewBackward returned an invalid gradient at index 0 - expected type TensorOptions(dtype=float, device=cuda:0, layout=Strided, requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt)) but got TensorOptions(dtype=float, device=cpu, layout=Strided, requires_grad=false (default), pinned_memory=false (default), memory_format=(nullopt))
```
It seems that some of my gradients are on the GPU and others on the CPU. The forward pass works fine, and if I train the model entirely on the CPU there is no error either. I have carefully checked the device of every tensor involved in the forward pass, and they all appear to be correctly on the GPU (when the GPU is selected).
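For context, here is a minimal sketch of the kind of bug I imagine could produce exactly this error (this is an assumption for illustration, not my actual code): a custom `autograd.Function` whose `backward` allocates its gradient on the CPU. The forward pass succeeds on CUDA, but `backward()` fails with the same device-mismatch message:

```python
import torch

class BadIdentity(torch.autograd.Function):
    @staticmethod
    def forward(ctx, x):
        return x.clone()  # forward works on any device

    @staticmethod
    def backward(ctx, grad_output):
        # BUG: the gradient is allocated on the CPU regardless of
        # grad_output.device, so autograd rejects it for CUDA inputs.
        return torch.ones(grad_output.shape)  # should be e.g. grad_output.clone()

# Requires a CUDA device to reproduce the mismatch.
x = torch.randn(4, device="cuda", requires_grad=True)
BadIdentity.apply(x).sum().backward()
# RuntimeError: Function BadIdentityBackward returned an invalid gradient
# at index 0 - expected ... device=cuda:0 ... but got ... device=cpu ...
```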
I was wondering if someone could give me a clue about what I am missing here. Is there a particular torch operation that “silently” runs on the CPU instead of the GPU? Is there a way to better debug this error? The message above doesn’t indicate which variable is on the CPU during the backward pass.
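In case it helps, this is the kind of probe I was thinking of to localize the offending tensor (a sketch, assuming a plain `nn.Module`; `install_grad_device_hooks` is just a hypothetical helper name). It registers a full backward hook on every submodule and prints the device of each gradient as autograd produces it, so the first entry on the wrong device should point at the culprit:

```python
import torch

def install_grad_device_hooks(model: torch.nn.Module):
    # Print the device of every gradient flowing out of each submodule
    # during backward(); requires register_full_backward_hook (PyTorch >= 1.8).
    for name, module in model.named_modules():
        def hook(mod, grad_input, grad_output, name=name):
            for g in grad_output:
                if g is not None:
                    print(f"{name}: grad_output on {g.device}")
        module.register_full_backward_hook(hook)

# usage: install_grad_device_hooks(model) before running a training step
```

As a quicker sanity check, iterating over `model.named_parameters()` and printing each parameter’s `.device` would also confirm whether any weights were left on the CPU.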
Thank you in advance!