Distributed training of the network shown below fails with the following error, whereas the same network trains fine without distributed training.
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [12, 256, 1, 1]] is at version 2; expected version 1 instead.
[W python_anomaly_mode.cpp:60] Warning: Error detected in CudnnConvolutionBackward. Traceback of forward call that caused the error:
File "/home/e2r/anaconda3/envs/e2r/lib/python3.7/threading.py", line 890, in _bootstrap
self._bootstrap_inner()
File "/home/e2r/anaconda3/envs/e2r/lib/python3.7/threading.py", line 926, in _bootstrap_inner
self.run()
File "/home/e2r/anaconda3/envs/e2r/lib/python3.7/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/home/e2r/anaconda3/envs/e2r/lib/python3.7/site-packages/torch/nn/parallel/parallel_apply.py", line 60, in _worker
output = module(*input, **kwargs)
File "/home/e2r/anaconda3/envs/e2r/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/net.py", line 51, in forward
out1 = self.conv2(out0.clone())
File "/home/e2r/anaconda3/envs/e2r/lib/python3.7/site-packages/torch/nn/modules/module.py", line 722, in _call_impl
result = self.forward(*input, **kwargs)
File "/home/e2r/anaconda3/envs/e2r/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 419, in forward
return self._conv_forward(input, self.weight)
File "/home/e2r/anaconda3/envs/e2r/lib/python3.7/site-packages/torch/nn/modules/conv.py", line 416, in _conv_forward
self.padding, self.dilation, self.groups)
(function print_stack)
Process Process-1:
Traceback (most recent call last):
File "/home/e2r/anaconda3/envs/e2r/lib/python3.7/multiprocessing/process.py", line 297, in _bootstrap
self.run()
File "/home/e2r/anaconda3/envs/e2r/lib/python3.7/multiprocessing/process.py", line 99, in run
self._target(*self._args, **self._kwargs)
File "/home/e2r/Desktop/e2r/train.py", line 51, in init_process
fn(rank, opt)
File "/home/e2r/Desktop/e2r/train.py", line 194, in main_worker
trainer.train()
File "/home/e2r/Desktop/e2r//trainer.py", line 166, in train
self.run_epoch()
File "/home/e2r/Desktop/e2r/trainer.py", line 199, in run_epoch
losses["loss"].backward()
File "/home/e2r/anaconda3/envs/e2r/lib/python3.7/site-packages/torch/tensor.py", line 185, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/home/e2r/anaconda3/envs/e2r/lib/python3.7/site-packages/torch/autograd/__init__.py", line 127, in backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [12, 256, 1, 1]] is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
Enabling anomaly detection traces the error back to a 1 x 1 convolution layer. I have tried many things to resolve it, such as calling clone()
on the tensor before passing it to the layer, and setting inplace=False
in the ReLU layers, but I still cannot resolve it. Here is the network:
class Net(nn.Module):
    """Convolutional head that regresses six values per predicted frame.

    Each input feature list contributes its last feature map, which is
    reduced to 256 channels by a 1x1 convolution.  The reduced maps are
    concatenated along the channel axis, passed through a small stack of
    3x3 convolutions, projected to ``6 * num_frames_to_predict_for``
    channels by a final 1x1 convolution, and averaged over the two spatial
    axes.  The six values per frame are split into two groups of three,
    named ``axisangle`` and ``translation``.

    Args:
        num_ch_enc: Sequence of channel counts; only the last entry is used
            (as the input channel count of ``squeeze``).
        num_input_features: Number of feature lists passed to ``forward``.
        num_frames_to_predict_for: Number of frames to predict for.
            Defaults to ``num_input_features - 1`` when ``None``.
        stride: Stride of the two 3x3 convolutions.
    """

    def __init__(self, num_ch_enc, num_input_features, num_frames_to_predict_for=None, stride=1):
        super().__init__()
        self.num_ch_enc = num_ch_enc
        self.num_input_features = num_input_features
        if num_frames_to_predict_for is None:
            num_frames_to_predict_for = num_input_features - 1
        self.num_frames_to_predict_for = num_frames_to_predict_for

        # Six output channels per predicted frame (3 + 3, split in forward()).
        num_outputs = int(6 * self.num_frames_to_predict_for)

        self.squeeze = nn.Conv2d(self.num_ch_enc[-1], 256, 1)
        self.conv0 = nn.Conv2d(num_input_features * 256, 256, 3, stride, 1)
        self.conv1 = nn.Conv2d(256, 256, 3, stride, 1)
        self.conv2 = nn.Conv2d(256, num_outputs, 1, 1, 0)
        self.relu = nn.ReLU(inplace=False)

    def forward(self, input_features):
        """Return the ``(axisangle, translation)`` predictions.

        Args:
            input_features: Iterable of indexable feature collections; the
                last element of each one is fed to ``squeeze``.
                Assumes each such element is an (N, num_ch_enc[-1], H, W)
                tensor -- TODO confirm against the caller.

        Returns:
            Tuple ``(axisangle, translation)``; each is the result of viewing
            the scaled output as ``(-1, num_frames_to_predict_for, 1, 6)``
            and taking the first / last three entries of the final axis.
        """
        # Only the last feature map of every input is used.
        squeezed = [self.relu(self.squeeze(features[-1])) for features in input_features]
        out = torch.cat(squeezed, 1)

        out = self.relu(self.conv0(out))
        # NOTE(review): conv1 is applied twice, so the two 3x3 stages share
        # one set of weights -- confirm this is intended.
        out = self.relu(self.conv1(out))
        out = self.relu(self.conv1(out))

        # The in-place modification error reported under distributed training
        # is traced to this layer.  The .clone() calls that used to surround
        # it did not change any value (and reportedly did not resolve the
        # error), so they have been removed.
        # NOTE(review): the tensor named in the error, [12, 256, 1, 1], has
        # the shape of conv2.weight for two predicted frames, so presumably
        # the parameter itself is modified between forward and backward --
        # check the training loop / parallel wrapper rather than this module.
        out = self.conv2(out)

        # Average over width (dim 3), then height (dim 2).
        out = out.mean(3).mean(2)
        out = 0.01 * out.view(-1, self.num_frames_to_predict_for, 1, 6)

        axisangle = out[..., :3]
        translation = out[..., 3:]
        return axisangle, translation
Edit:
I tried to simplify the forward function as follows, but I still get the same error, this time at the linear layer:
def forward(self, x):
    """Simplified forward pass: ``pose3`` followed by one linear layer.

    Args:
        x: Nested indexable; only ``x[-1][-1]`` is used.

    Returns:
        Tuple of two tensors: the first three and the last three entries of
        the final axis of the linear output, after scaling by 0.01 and
        viewing it as ``(-1, num_frames_to_predict_for, 1, 6)``.
    """
    features = x[-1][-1]
    features = self.pose3(features)

    # NOTE(review): 6 * 6 * 20 is presumably the per-frame size of pose3's
    # output -- confirm against the definition of self.pose3.
    flat_size = self.num_frames_to_predict_for * 6 * 6 * 20
    features = features.view(-1, flat_size)

    # The same in-place modification error is reported at this linear layer.
    # The former .clone() on its output did not change any value, so it has
    # been removed.
    pose = self.linear(features)
    pose = 0.01 * pose.view(-1, self.num_frames_to_predict_for, 1, 6)
    return pose[..., :3], pose[..., 3:]