Hi,
I've encountered the error in the title:
Traceback (most recent call last):
  File "train.py", line 171, in <module>
    main()
  File "train.py", line 122, in main
    training_losses, validation_losses, lr_rates = trainer.run_trainer()
  File "/content/gsn/utils/trainer.py", line 67, in run_trainer
    self._train()
  File "/content/gsn/utils/trainer.py", line 141, in _train
    loss.backward(retain_graph=True)
  File "/usr/local/lib/python3.7/dist-packages/torch/tensor.py", line 245, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
  File "/usr/local/lib/python3.7/dist-packages/torch/autograd/__init__.py", line 147, in backward
    allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [1024, 512, 3, 3]] is at version 2; expected version 1 instead.
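If I read the message correctly, the same class of error can be reproduced in isolation when a recurrent state keeps the previous iteration's graph alive and optimizer.step() updates the weights in place before that old graph is traversed again. A minimal toy sketch (a stand-in nn.Linear "cell", not my actual model):

    import torch
    import torch.nn as nn

    cell = nn.Linear(4, 4)                      # stand-in for the real model
    opt = torch.optim.SGD(cell.parameters(), lr=0.1)
    h = torch.zeros(1, 4, requires_grad=True)   # carried "state"

    for step in range(2):
        h = cell(h)                       # h keeps the previous step's graph alive
        loss = h.sum()
        loss.backward(retain_graph=True)  # on step 1 this also walks step 0's graph
        opt.step()                        # in-place update of cell.weight (version bump)
        opt.zero_grad()
    # the second backward() raises the same "modified by an inplace operation"
    # error, because step 0's graph saved the weight at an older version
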
Anomaly detection mode suggests that the problematic line is:
out, states = self.model(input, self.last_states_for_each_channel)
Here is the rest of the training function for context:
def _train(self):
    if self.notebook:
        from tqdm.notebook import tqdm, trange
    else:
        from tqdm import tqdm, trange

    self.model.train()  # train mode
    train_losses = []   # accumulate the losses here
    batch_iter = tqdm(enumerate(self.training_DataLoader), 'Training',
                      total=len(self.training_DataLoader), leave=False)
    iter = 0
    torch.autograd.set_detect_anomaly(True)

    for i, (x, y) in batch_iter:
        # train_losses, loss_value = self.sample_learn(x, y)
        y = y.unsqueeze(dim=0)
        input, target = x.to(self.device), y.to(self.device)  # send to device (GPU or CPU)
        input = self.event_preprocessor(input)
        input = self.crop.pad(input)
        target = self.crop.pad(target)
        self.optimizer.zero_grad()  # zero-grad the parameters
        with CudaTimer('Inference'):
            out, states = self.model(input, self.last_states_for_each_channel)  # one forward pass
        if self.no_recurrent:
            self.last_states_for_each_channel = None
        else:
            self.last_states_for_each_channel = states
        # target = target.type(torch.long)
        loss = self.criterion(out, target)  # calculate loss
        loss_value = loss.item()
        train_losses.append(loss_value)
        loss.backward(retain_graph=True)  # one backward pass
        self.optimizer.step()             # update the parameters
        # if iter == 0:
        #     self.optimizer.step()
        #     loss.backward(retain_graph=True)
        #     iter += 1
        # else:
        #     loss.backward(retain_graph=True)
        #     self.optimizer.step()
        batch_iter.set_description(f'Training: (loss {loss_value:.4f})')  # update progressbar

    self.training_loss.append(np.mean(train_losses))
    self.learning_rate.append(self.optimizer.param_groups[0]['lr'])
    batch_iter.close()
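Is the right fix to detach the states before storing them, so that each backward stays inside the current iteration's graph? Something like this (untested sketch; it assumes states is a flat list/tuple of tensors, a nested structure would need recursive detaching):

    if self.no_recurrent:
        self.last_states_for_each_channel = None
    else:
        # detach the carried states so iteration N's backward does not reach
        # into iteration N-1's graph, whose weights optimizer.step() has
        # already modified in place
        self.last_states_for_each_channel = [s.detach() for s in states]

    loss = self.criterion(out, target)
    loss.backward()        # retain_graph=True should then be unnecessary
    self.optimizer.step()

With the states detached, each iteration's graph would be independent, so retain_graph=True could presumably be dropped as well.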