Hi everyone, hope you are having a great time.
I recently faced this issue: back in 1.8 or 1.9 I had to train a model with dropout's inplace=False
or otherwise it would crash. Now, in 1.11 and 1.13.1, I can't seem to flip that back to True, and it seems I'm stuck with dropout inplace=False.
If I set it to True, I get this error:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.HalfTensor [256, 256, 11, 11]], which is output 0 of ReluBackward0, is at version 2; expected version 1 instead.
the full stacktrace looks like this:
/home/hossein/anaconda3/lib/python3.9/site-packages/torch/autograd/__init__.py:197: UserWarning: Error detected in ReluBackward0. Traceback of forward call that caused the error:
File "/home/hossein/pytorch-image-models/train.py", line 806, in <module>
main()
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/hossein/pytorch-image-models/train.py", line 603, in main
train_metrics = train_one_epoch(epoch, model, loader_train, optimizer, train_loss_fn, args,
File "/home/hossein/pytorch-image-models/train.py", line 670, in train_one_epoch
output = model(input)
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/hossein/pytorch-image-models/timm/models/simplenet.py", line 210, in forward
out = self.features(x)
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/nn/modules/container.py", line 204, in forward
input = module(input)
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1194, in _call_impl
return forward_call(*input, **kwargs)
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/nn/modules/activation.py", line 102, in forward
return F.relu(input, inplace=self.inplace)
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/nn/functional.py", line 1455, in relu
result = torch.relu_(input)
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/fx/traceback.py", line 57, in format_stack
return traceback.format_stack()
(Triggered internally at ../torch/csrc/autograd/python_anomaly_mode.cpp:114.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
Traceback (most recent call last):
File "/home/hossein/pytorch-image-models/train.py", line 806, in <module>
main()
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/hossein/pytorch-image-models/train.py", line 603, in main
train_metrics = train_one_epoch(epoch, model, loader_train, optimizer, train_loss_fn, args,
File "/home/hossein/pytorch-image-models/train.py", line 678, in train_one_epoch
loss_scaler(loss, optimizer,
File "/home/hossein/pytorch-image-models/timm/utils/cuda.py", line 43, in __call__
self._scaler.scale(loss).backward(create_graph=create_graph)
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/_tensor.py", line 488, in backward
torch.autograd.backward(
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/autograd/__init__.py", line 197, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.HalfTensor [256, 256, 11, 11]], which is output 0 of ReluBackward0, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
ERROR:torch.distributed.elastic.multiprocessing.api:failed (exitcode: 1) local_rank: 0 (pid: 1752876) of binary: /home/hossein/anaconda3/bin/python3
Traceback (most recent call last):
File "/home/hossein/anaconda3/lib/python3.9/runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/hossein/anaconda3/lib/python3.9/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/distributed/launch.py", line 195, in <module>
main()
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/distributed/launch.py", line 191, in main
launch(args)
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/distributed/launch.py", line 176, in launch
run(args)
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/distributed/run.py", line 753, in run
elastic_launch(
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 132, in __call__
return launch_agent(self._config, self._entrypoint, list(args))
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/distributed/launcher/api.py", line 246, in launch_agent
raise ChildFailedError(
torch.distributed.elastic.multiprocessing.errors.ChildFailedError:
============================================================
train.py FAILED
------------------------------------------------------------
Failures:
<NO_OTHER_FAILURES>
------------------------------------------------------------
Root Cause (first observed failure):
[0]:
time : 2023-02-04_13:42:08
host : hossein-pc
rank : 0 (local_rank: 0)
exitcode : 1 (pid: 1752876)
error_file: /tmp/torchelastic_40eggmop/none_e2lzr3bg/attempt_0/0/error.json
traceback : Traceback (most recent call last):
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/distributed/elastic/multiprocessing/errors/__init__.py", line 346, in wrapper
return f(*args, **kwargs)
File "/home/hossein/pytorch-image-models/train.py", line 603, in main
train_metrics = train_one_epoch(epoch, model, loader_train, optimizer, train_loss_fn, args,
File "/home/hossein/pytorch-image-models/train.py", line 678, in train_one_epoch
loss_scaler(loss, optimizer,
File "/home/hossein/pytorch-image-models/timm/utils/cuda.py", line 43, in __call__
self._scaler.scale(loss).backward(create_graph=create_graph)
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/_tensor.py", line 488, in backward
torch.autograd.backward(
File "/home/hossein/anaconda3/lib/python3.9/site-packages/torch/autograd/__init__.py", line 197, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.HalfTensor [256, 256, 11, 11]], which is output 0 of ReluBackward0, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
============================================================
As for the model, it's a plain CNN (VGG-like) with nothing fancy about it — just a good old CNN!
What's going on, and how can I get rid of this annoyance?
Thanks a lot in advance