I am using torch 1.13.0+cu116 with Hugging Face Accelerate to train a model with DDP.
A module is defined as follows:
import torch.nn as nn

class Conv1d(nn.Module):
    def __init__(self, cin, cout, kernel_size, stride, padding, residual=False, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.conv = nn.Conv1d(cin, cout, kernel_size, stride, padding)
        self.bn = nn.BatchNorm1d(cout, track_running_stats=True)
        self.act = nn.LeakyReLU(0.2, inplace=False)
        self.residual = residual

    def forward(self, x):
        out = self.conv(x.clone())
        out = self.bn(out)
        if self.residual:
            out = out + x
        return self.act(out)
The module defined above is a block that is integrated into a larger model.
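For context, here is a minimal sketch of how such blocks are chained; the real model lives in modules/postnet/models.py and is larger, so the wrapper name, block count, and dimensions below are my assumptions for illustration only (the traceback further down shows a block3 built as an nn.Sequential):

import torch
import torch.nn as nn

# Hypothetical wrapper: stacks a few Conv1d blocks, roughly mirroring how
# self.block3 appears as an nn.Sequential in the traceback below.
class PostNetSketch(nn.Module):
    def __init__(self, dim=128):
        super().__init__()
        self.block3 = nn.Sequential(
            Conv1d(dim, dim, kernel_size=3, stride=1, padding=1, residual=True),
            Conv1d(dim, dim, kernel_size=3, stride=1, padding=1, residual=True),
        )

    def forward(self, x):
        # x: (batch, time, dim) -> Conv1d/BatchNorm1d expect (batch, dim, time)
        transposed_x = x.transpose(1, 2)
        out = self.block3(transposed_x)
        return out.transpose(1, 2)

# Usage sketch
model = PostNetSketch(dim=128)
y = model(torch.randn(4, 50, 128))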
Here are my observations:

- An error regarding an in-place operation occurs when calling loss.backward() during training.
- Commenting out the batchnorm in the forward pass suppresses the error.
- Setting track_running_stats=False when initializing nn.BatchNorm1d also suppresses the error (a minimal sketch of this is shown just below).

However, either bypass could affect the performance of the trained model.
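For clarity, the second bypass is simply constructing the batchnorm without running statistics, with the rest of the block unchanged:

# Workaround sketch: disable running-stats tracking so BatchNorm1d always
# normalizes with per-batch statistics and keeps no running_mean/running_var buffers.
self.bn = nn.BatchNorm1d(cout, track_running_stats=False)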
I would appreciate it if someone could help me with this problem.
Thanks in advance.
I am including the relevant traceback as follows:
UserWarning: Error detected in CudnnBatchNormBackward0. Traceback of forward call that caused the error:
File "/ext/Qifeng/GeneFace/tasks/run.py", line 19, in <module>
run_task()
File "/ext/Qifeng/GeneFace/tasks/run.py", line 14, in run_task
task_cls.start()
File "/ext/Qifeng/GeneFace/utils/commons/base_task.py", line 251, in start
trainer.fit(cls)
File "/ext/Qifeng/GeneFace/utils/commons/trainer.py", line 130, in fit
self.run_single_process(self.task)
File "/ext/Qifeng/GeneFace/utils/commons/trainer.py", line 203, in run_single_process
self.train()
File "/ext/Qifeng/GeneFace/utils/commons/trainer.py", line 317, in train
pbar_metrics, tb_metrics = self.run_training_batch(batch_idx, batch)
File "/ext/Qifeng/GeneFace/utils/commons/trainer.py", line 368, in run_training_batch
output = task_ref.training_step(*args)
File "/ext/Qifeng/GeneFace/utils/commons/base_task.py", line 109, in training_step
loss_ret = self._training_step(sample, batch_idx, optimizer_idx)
File "/ext/Qifeng/GeneFace/tasks/postnet/lm3d_postnet_adv_sync_pitch.py", line 183, in _training_step
loss_output, model_out = self.run_model(sample)
File "/ext/Qifeng/GeneFace/tasks/postnet/lm3d_postnet_adv_sync_pitch.py", line 149, in run_model
refine_pred_lm3d_for_person_ds = self.model(raw_pred_lm3d_for_person_ds, pitch_for_person_ds) * person_batch['y_mask'].unsqueeze(-1)
File "/home/qw/anaconda3/envs/gf/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/home/qw/anaconda3/envs/gf/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1040, in forward
output = self._run_ddp_forward(*inputs, **kwargs)
File "/home/qw/anaconda3/envs/gf/lib/python3.9/site-packages/torch/nn/parallel/distributed.py", line 1000, in _run_ddp_forward
return module_to_run(*inputs[0], **kwargs[0])
File "/home/qw/anaconda3/envs/gf/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/ext/Qifeng/GeneFace/modules/postnet/models.py", line 136, in forward
diff_x = self.block3(transposed_diff_x) #.transpose(1, 2)
File "/home/qw/anaconda3/envs/gf/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/home/qw/anaconda3/envs/gf/lib/python3.9/site-packages/torch/nn/modules/container.py", line 204, in forward
input = module(input)
File "/home/qw/anaconda3/envs/gf/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/ext/Qifeng/GeneFace/modules/postnet/models.py", line 31, in forward
out = self.bn(out)
File "/home/qw/anaconda3/envs/gf/lib/python3.9/site-packages/torch/nn/modules/module.py", line 1190, in _call_impl
return forward_call(*input, **kwargs)
File "/home/qw/anaconda3/envs/gf/lib/python3.9/site-packages/torch/nn/modules/batchnorm.py", line 171, in forward
return F.batch_norm(
File "/home/qw/anaconda3/envs/gf/lib/python3.9/site-packages/torch/nn/functional.py", line 2450, in batch_norm
return torch.batch_norm(
File "/home/qw/anaconda3/envs/gf/lib/python3.9/site-packages/torch/fx/traceback.py", line 57, in format_stack
return traceback.format_stack()
(Triggered internally at ../torch/csrc/autograd/python_anomaly_mode.cpp:114.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
Traceback (most recent call last):
File "/ext/Qifeng/GeneFace/tasks/run.py", line 19, in <module>
run_task()
File "/ext/Qifeng/GeneFace/tasks/run.py", line 14, in run_task
task_cls.start()
File "/ext/Qifeng/GeneFace/utils/commons/base_task.py", line 251, in start
trainer.fit(cls)
File "/ext/Qifeng/GeneFace/utils/commons/trainer.py", line 130, in fit
self.run_single_process(self.task)
File "/ext/Qifeng/GeneFace/utils/commons/trainer.py", line 203, in run_single_process
self.train()
File "/ext/Qifeng/GeneFace/utils/commons/trainer.py", line 317, in train
pbar_metrics, tb_metrics = self.run_training_batch(batch_idx, batch)
File "/ext/Qifeng/GeneFace/utils/commons/trainer.py", line 383, in run_training_batch
hparams['acc'].backward(loss)
File "/home/qw/anaconda3/envs/gf/lib/python3.9/site-packages/accelerate/accelerator.py", line 1989, in backward
loss.backward(**kwargs)
File "/home/qw/anaconda3/envs/gf/lib/python3.9/site-packages/torch/_tensor.py", line 487, in backward
torch.autograd.backward(
File "/home/qw/anaconda3/envs/gf/lib/python3.9/site-packages/torch/autograd/__init__.py", line 197, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [128]] is at version 3; expected version 2 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
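For completeness, the "Traceback of forward call" section above was captured with autograd anomaly detection turned on (my assumption is the usual switch), roughly:

import torch

# Anomaly detection makes the backward error report the forward call that
# created the failing op (CudnnBatchNormBackward0 above), at some runtime cost.
torch.autograd.set_detect_anomaly(True)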