Hi there!
I’m working on a simple convolutional NN model and I am encountering the following error: “RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation”.
Traceback (most recent call last):
File "main.py", line 39, in <module>
central_server.startFL()
File "/home/wise/miri/FederatedLearningResearch/Server.py", line 321, in startFL
self.train_federated_model()
File "/home/wise/miri/FederatedLearningResearch/Server.py", line 266, in train_federated_model
selected_total_size = workhorse.map(self.mp_update_selected_clients, sampled_client_indices)
File "/home/wise/anaconda3/envs/FL/lib/python3.8/multiprocessing/pool.py", line 364, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "/home/wise/anaconda3/envs/FL/lib/python3.8/multiprocessing/pool.py", line 771, in get
raise self._value
File "/home/wise/anaconda3/envs/FL/lib/python3.8/multiprocessing/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "/home/wise/anaconda3/envs/FL/lib/python3.8/multiprocessing/pool.py", line 48, in mapstar
return list(map(*args))
File "/home/wise/miri/FederatedLearningResearch/Server.py", line 210, in mp_update_selected_clients
self.clients[selected_index].client_update()
File "/home/wise/miri/FederatedLearningResearch/Client.py", line 55, in client_update
loss.backward()
File "/home/wise/anaconda3/envs/FL/lib/python3.8/site-packages/torch/_tensor.py", line 363, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/wise/anaconda3/envs/FL/lib/python3.8/site-packages/torch/autograd/__init__.py", line 173, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [64, 32, 5, 5]] is at version 3; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
I can’t find the in-place operation in my code. I also tried clone()
and nn.ReLU(inplace=False),
but neither fixes the error.
I don’t know which line to fix.
Running with torch.autograd.set_detect_anomaly(True)
reports the following detected errors:
/home/wise/anaconda3/envs/FL/lib/python3.8/site-packages/torch/autograd/__init__.py:173: UserWarning: Error detected in ConvolutionBackward0. Traceback of forward call that caused the error:
File "/home/wise/anaconda3/envs/FL/lib/python3.8/threading.py", line 890, in _bootstrap
self._bootstrap_inner()
File "/home/wise/anaconda3/envs/FL/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/home/wise/anaconda3/envs/FL/lib/python3.8/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/home/wise/anaconda3/envs/FL/lib/python3.8/multiprocessing/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "/home/wise/anaconda3/envs/FL/lib/python3.8/multiprocessing/pool.py", line 48, in mapstar
return list(map(*args))
File "/home/wise/miri/FederatedLearningResearch/Server.py", line 210, in mp_update_selected_clients
self.clients[selected_index].client_update()
File "/home/wise/miri/FederatedLearningResearch/Client.py", line 53, in client_update
outputs = self.model(data)
File "/home/wise/anaconda3/envs/FL/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/wise/miri/FederatedLearningResearch/models.py", line 48, in forward
x = self.activation(self.conv1(x))
File "/home/wise/anaconda3/envs/FL/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/wise/anaconda3/envs/FL/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 447, in forward
return self._conv_forward(input, self.weight, self.bias)
File "/home/wise/anaconda3/envs/FL/lib/python3.8/site-packages/torch/nn/modules/conv.py", line 443, in _conv_forward
return F.conv2d(input, weight, bias, self.stride,
(Triggered internally at ../torch/csrc/autograd/python_anomaly_mode.cpp:104.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
/home/wise/anaconda3/envs/FL/lib/python3.8/site-packages/torch/autograd/__init__.py:173: UserWarning: Error detected in LogSoftmaxBackward0. Traceback of forward call that caused the error:
File "/home/wise/anaconda3/envs/FL/lib/python3.8/threading.py", line 890, in _bootstrap
self._bootstrap_inner()
File "/home/wise/anaconda3/envs/FL/lib/python3.8/threading.py", line 932, in _bootstrap_inner
self.run()
File "/home/wise/anaconda3/envs/FL/lib/python3.8/threading.py", line 870, in run
self._target(*self._args, **self._kwargs)
File "/home/wise/anaconda3/envs/FL/lib/python3.8/multiprocessing/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "/home/wise/anaconda3/envs/FL/lib/python3.8/multiprocessing/pool.py", line 48, in mapstar
return list(map(*args))
File "/home/wise/miri/FederatedLearningResearch/Server.py", line 210, in mp_update_selected_clients
self.clients[selected_index].client_update()
File "/home/wise/miri/FederatedLearningResearch/Client.py", line 54, in client_update
loss = eval(self.criterion)()(outputs, labels)
File "/home/wise/anaconda3/envs/FL/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1110, in _call_impl
return forward_call(*input, **kwargs)
File "/home/wise/anaconda3/envs/FL/lib/python3.8/site-packages/torch/nn/modules/loss.py", line 1163, in forward
return F.cross_entropy(input, target, weight=self.weight,
File "/home/wise/anaconda3/envs/FL/lib/python3.8/site-packages/torch/nn/functional.py", line 2996, in cross_entropy
return torch._C._nn.cross_entropy_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index, label_smoothing)
(Triggered internally at ../torch/csrc/autograd/python_anomaly_mode.cpp:104.)
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
Traceback (most recent call last):
File "main.py", line 39, in <module>
central_server.startFL()
File "/home/wise/miri/FederatedLearningResearch/Server.py", line 321, in startFL
self.train_federated_model()
File "/home/wise/miri/FederatedLearningResearch/Server.py", line 266, in train_federated_model
selected_total_size = workhorse.map(self.mp_update_selected_clients, sampled_client_indices)
File "/home/wise/anaconda3/envs/FL/lib/python3.8/multiprocessing/pool.py", line 364, in map
return self._map_async(func, iterable, mapstar, chunksize).get()
File "/home/wise/anaconda3/envs/FL/lib/python3.8/multiprocessing/pool.py", line 771, in get
raise self._value
File "/home/wise/anaconda3/envs/FL/lib/python3.8/multiprocessing/pool.py", line 125, in worker
result = (True, func(*args, **kwds))
File "/home/wise/anaconda3/envs/FL/lib/python3.8/multiprocessing/pool.py", line 48, in mapstar
return list(map(*args))
File "/home/wise/miri/FederatedLearningResearch/Server.py", line 210, in mp_update_selected_clients
self.clients[selected_index].client_update()
File "/home/wise/miri/FederatedLearningResearch/Client.py", line 55, in client_update
loss.backward()
File "/home/wise/anaconda3/envs/FL/lib/python3.8/site-packages/torch/_tensor.py", line 363, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/home/wise/anaconda3/envs/FL/lib/python3.8/site-packages/torch/autograd/__init__.py", line 173, in backward
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [32, 1, 5, 5]] is at version 1; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
My training code (Client.py):
def client_update(self):
    """Train the local model on the local dataset.

    Runs ``self.local_epoch`` passes over ``self.dataloader``, taking one
    optimizer step per batch, then moves the model back to the CPU.

    Reads ``self.model``, ``self.device``, ``self.optimizer``,
    ``self.optim_config``, ``self.criterion``, ``self.local_epoch`` and
    ``self.dataloader``. Returns ``None``; the model is updated in place.
    """
    self.model.train()
    self.model.to(self.device)

    # NOTE: eval() resolves the optimizer/criterion from configuration
    # strings. Those strings must come from trusted configuration only.
    optimizer = eval(self.optimizer)(self.model.parameters(), **self.optim_config)
    # The loss module is loop-invariant, so build it once instead of once
    # per batch.
    criterion = eval(self.criterion)()

    # NOTE(review): "modified by an inplace operation" raised from
    # loss.backward() is consistent with several pool workers calling
    # optimizer.step() on the *same* model object concurrently. Confirm that
    # every client owns its own copy of the model (e.g. copy.deepcopy of the
    # global model) before this method is run in parallel.

    # Anomaly detection is a debugging aid and slows training; it is entered
    # once around the whole loop rather than once per batch.
    with torch.autograd.set_detect_anomaly(True):
        for _ in range(self.local_epoch):
            for data, labels in self.dataloader:
                data = data.float().to(self.device)
                labels = labels.long().to(self.device)

                optimizer.zero_grad()
                loss = criterion(self.model(data), labels)
                loss.backward()
                optimizer.step()

    # Release cached GPU blocks once per call, before handing the model back
    # on the CPU.
    if self.device == "cuda":
        torch.cuda.empty_cache()
    self.model.to("cpu")
My model code (models.py):
class CNN(nn.Module):
    """Two-layer convolutional classifier followed by two linear layers.

    Layout: conv(5x5) -> ReLU -> maxpool(2x2) -> conv(5x5) -> ReLU ->
    maxpool(2x2) -> flatten -> linear -> ReLU -> linear. No layer has a bias.

    Args:
        name: Label stored on the instance as ``self.name``.
        in_channels: Number of channels of the input images.
        hidden_channels: Output channels of the first convolution; the
            second convolution produces twice as many.
        num_hiddens: Width of the hidden linear layer.
        num_classes: Number of output logits.
        input_size: Spatial size of the input images, either a single int
            for square images or a ``(height, width)`` pair. The default of
            28 yields the 7x7 feature map the layer sizes were originally
            hard-coded for.

    Raises:
        ValueError: If ``input_size`` is too small for the two 5x5
            convolutions.
    """

    def __init__(self, name, in_channels, hidden_channels, num_hiddens, num_classes,
                 input_size=28):
        super().__init__()
        self.name = name

        self.activation = nn.ReLU(inplace=False)

        self.conv1 = nn.Conv2d(in_channels=in_channels, out_channels=hidden_channels, kernel_size=(5, 5), padding=1, stride=1, bias=False)
        self.conv2 = nn.Conv2d(in_channels=hidden_channels, out_channels=hidden_channels * 2, kernel_size=(5, 5), padding=1, stride=1, bias=False)

        self.maxpool1 = nn.MaxPool2d(kernel_size=(2, 2), padding=1)
        self.maxpool2 = nn.MaxPool2d(kernel_size=(2, 2), padding=1)
        self.flatten = nn.Flatten()

        # Derive the flattened feature size from the input size instead of
        # assuming a 7x7 map, so the model is not tied to 28x28 images.
        if isinstance(input_size, int):
            height = width = input_size
        else:
            height, width = input_size
        flat_features = (hidden_channels * 2) * self._feature_extent(height) * self._feature_extent(width)

        self.fc1 = nn.Linear(in_features=flat_features, out_features=num_hiddens, bias=False)
        self.fc2 = nn.Linear(in_features=num_hiddens, out_features=num_classes, bias=False)

    @staticmethod
    def _feature_extent(size):
        """Return one spatial extent after both conv + maxpool stages."""
        for _ in range(2):
            if size < 3:
                raise ValueError("input_size is too small for two 5x5 convolutions")
            # 5x5 conv with padding 1 and stride 1 shrinks the extent by 2;
            # 2x2 maxpool with padding 1 (stride 2) maps n to n // 2 + 1.
            size = (size - 2) // 2 + 1
        return size

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for ``x``."""
        x = self.activation(self.conv1(x))
        x = self.maxpool1(x)

        x = self.activation(self.conv2(x))
        x = self.maxpool2(x)
        x = self.flatten(x)

        x = self.activation(self.fc1(x))
        x = self.fc2(x)
        return x
I really don’t know what the problem is.
Do you know how to resolve it?