Backpropagating multiple losses

I am training model 1 (using train1) with a specific loss function that involves tensor A. I am accumulating the loss and then want to perform an update. Next I am training a second model 2 (train2) in which I want to calculate the gradients wrt A using the loss calculated in train2. Thus I am adding loss 1 to loss2.

#reproduce error
from transformers import BertModel, BertForMaskedLM, BertConfig, EncoderDecoderModel
import torch
import torch.nn.functional as F
model1 = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert from pre-trained checkpoints
model2 = EncoderDecoderModel.from_encoder_decoder_pretrained('bert-base-uncased', 'bert-base-uncased') # initialize Bert2Bert from pre-trained checkpoints

optimizer1 = torch.optim.Adam(model1.parameters(), lr=0.001)
A=torch.rand(1, requires_grad=True)
optimizer3 = torch.optim.SGD([A], lr=0.1)

en_input=torch.tensor([[1,2], [3,4]])
en_masks=torch.tensor([[0,0], [0,0]])
de_output=torch.tensor([[3,1], [4,2]])
de_masks=torch.tensor([[0,0], [0,0]])
lm_labels=torch.tensor([[5,7], [6,8]])


def train1():
  for i in range(2):
    out = model1(input_ids=en_input, attention_mask=en_masks, decoder_input_ids=de_output, 
                        decoder_attention_mask=de_masks, labels=lm_labels.clone())

    prediction_scores = out[1]
    predictions = F.log_softmax(prediction_scores, dim=2)
    p=((predictions.sum() - de_output.sum())*A).sum()
    p=torch.unsqueeze(p, dim=0)
    acc =,acc)) # accumulating the loss 

  return loss

def train2(loss1):
for i in range (2):
   output = model2(input_ids=en_input, attention_mask=en_masks, 
                      decoder_attention_mask=de_masks, labels=lm_labels.clone())
   prediction_scores_ = output[1]
   predictions_= F.log_softmax(prediction_scores_, dim=2)
   loss2=((predictions_.sum() - de_output.sum())).sum()+loss1 # want to calculate gradients 
 wrt A
   loss2.backward(inputs=[A], retain_graph=True) 
   optimizer3.step() #update A based on calculated gradients


If this is the right method, I am not understanding whats wrong in my code? If its not right, I would appreciate if someone pointed me in the right direction.

error trace

/usr/local/lib/python3.7/dist-packages/torch/autograd/ UserWarning: Error detected in MulBackward0. Traceback of forward call that caused the error:
  File "/usr/lib/python3.7/", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/usr/lib/python3.7/", line 85, in _run_code
    exec(code, run_globals)
  File "/usr/local/lib/python3.7/dist-packages/", line 16, in <module>
  File "/usr/local/lib/python3.7/dist-packages/traitlets/config/", line 845, in launch_instance
  File "/usr/local/lib/python3.7/dist-packages/ipykernel/", line 499, in start
  File "/usr/local/lib/python3.7/dist-packages/tornado/platform/", line 132, in start
  File "/usr/lib/python3.7/asyncio/", line 541, in run_forever
  File "/usr/lib/python3.7/asyncio/", line 1786, in _run_once
  File "/usr/lib/python3.7/asyncio/", line 88, in _run, *self._args)
  File "/usr/local/lib/python3.7/dist-packages/tornado/platform/", line 122, in _handle_events
    handler_func(fileobj, events)
  File "/usr/local/lib/python3.7/dist-packages/tornado/", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/zmq/eventloop/", line 451, in _handle_events
  File "/usr/local/lib/python3.7/dist-packages/zmq/eventloop/", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/usr/local/lib/python3.7/dist-packages/zmq/eventloop/", line 434, in _run_callback
    callback(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/tornado/", line 300, in null_wrapper
    return fn(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/ipykernel/", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/usr/local/lib/python3.7/dist-packages/ipykernel/", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/usr/local/lib/python3.7/dist-packages/ipykernel/", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/usr/local/lib/python3.7/dist-packages/ipykernel/", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/usr/local/lib/python3.7/dist-packages/ipykernel/", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/", line 2718, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/", line 2822, in run_ast_nodes
    if self.run_code(code, result):
  File "/usr/local/lib/python3.7/dist-packages/IPython/core/", line 2882, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-66-c603f915c713>", line 78, in <module>
  File "<ipython-input-66-c603f915c713>", line 25, in train1
    p=((predictions.sum() - de_output.sum())*A).sum()
 (Triggered internally at  /pytorch/torch/csrc/autograd/python_anomaly_mode.cpp:104.)
  allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag
RuntimeError                              Traceback (most recent call last)
<ipython-input-66-c603f915c713> in <module>()
     77 for i in range(2):
     78   loss1=train1()
---> 79   train2(loss1)

2 frames
<ipython-input-66-c603f915c713> in train2(loss1)
     69     print(A.grad)
     70     #loss2.grad(inputs=A,outputs=A, only_inputs=True)
---> 71     loss2.backward(inputs=[A],retain_graph=True) #calculates gradients # retain_graph=True #list(dec.parameters())
     72     print(A.grad)
     73     # torch.nn.utils.clip_grad_norm_(model1.parameters(), 1.0)

/usr/local/lib/python3.7/dist-packages/torch/ in backward(self, gradient, retain_graph, create_graph, inputs)
    243                 create_graph=create_graph,
    244                 inputs=inputs)
--> 245         torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
    247     def register_hook(self, hook):

/usr/local/lib/python3.7/dist-packages/torch/autograd/ in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    145     Variable._execution_engine.run_backward(
    146         tensors, grad_tensors_, retain_graph, create_graph, inputs,
--> 147         allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1]] is at version 1; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

Based on the description it seems you are trying to use stale intermediate activations to calculate the gradients for already updated parameters, which would raise this error.
This post explains the issue in more detail using a GAN training approach.

hey @ptrblck,
Thank you so much for replying :slight_smile:. I understood my mistake. I had another ques, Is it possible for me to calculate the gradients wrt A, if I don’t add loss1 to loss2 and simply do a loss2.backward(inputs=[A]) ? Thanks.

That might be possible, as it seems A is used in the loss calculation and might not be using the aforementioned stale activations. In any case, you could just run the code and see, if Autograd would raise an error.

Well, I did try running the code without it. There is no change in the gradients of A.