[Solved][Pytorch1.5] RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation

Hi,

Indeed, the only way to get something similar with earlier releases is to use autograd.grad() and then populate the .grad fields manually with the gradient it returned.
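
For reference, a minimal sketch of that workaround (model, loss, and optimizer here stand in for your own objects):

params = [p for p in model.parameters() if p.requires_grad]
grads = torch.autograd.grad(loss, params)  # computes the gradients without touching the .grad fields
for p, g in zip(params, grads):
    p.grad = g  # populate the .grad fields manually
optimizer.step()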

1 Like

I met the same problem as you: when I pass two inputs through the backbone, the error appears,
but with only one input the error disappears. May I ask how you solved this problem?
This is my code:
inputs = inputs.cuda(cfg['GPU'], non_blocking=True)
labels = labels.cuda(cfg['GPU'], non_blocking=True)
inputs_ = inputs_.cuda(cfg['GPU'], non_blocking=True)
labels_ = labels_.cuda(cfg['GPU'], non_blocking=True)
features = backbone(inputs)
features_ = backbone(inputs_)
outputs = head(features, labels)
outputs_ = head(features_, labels_)
lossx = loss(outputs, labels) + loss(outputs_, labels_)
optimizer.zero_grad()
lossx.backward()
optimizer.step()

Thanks! It worked in my code.
But I have a question about this. Isn’t there any effect from action_loss on value_optimizer?
With my limited knowledge, I don’t understand why the chain
“value_loss → action_loss → value_optimizer → action_optimizer” gives each optimizer the correct update.

Could anyone help me and explain this briefly? Thanks.

Thanks for all the wonderful discussion.

I just wanted to confirm that the following two solutions give the same, correct optimization results (PyTorch 1.9.0).

In such a pipeline:

G = Model1()
D = Model2()
optim1 = optim.Adam(G.parameters())
optim2 = optim.Adam(D.parameters())
recons, z = G(input)
loss1 = loss_func1(recons)
diff = D(z)
loss2 = loss_func2(diff)
loss3 = loss_func3(diff)
loss_G = loss1 + loss2  # we don't want to update D's parameters here
loss_D = loss3

Solution #1

optim1.zero_grad()
loss_G.backward(retain_graph=True)
optim2.zero_grad()
loss_D.backward()
optim1.step()
optim2.step()

Solution #2

optim1.zero_grad()
loss_G.backward(retain_graph=True, inputs=list(G.parameters()))
optim1.step()
optim2.zero_grad()
loss_D.backward(inputs=list(D.parameters()))
optim2.step()
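
As a side note, the inputs argument makes backward() accumulate gradients only into the listed tensors, which is why Solution #2 never writes generator gradients into D or vice versa. A toy check (w1 and w2 are made-up tensors for illustration):

import torch

w1 = torch.randn(3, requires_grad=True)
w2 = torch.randn(3, requires_grad=True)
loss = (w1 * w2).sum()
loss.backward(inputs=[w1])  # gradients are accumulated only into w1
print(w1.grad)  # equal to w2
print(w2.grad)  # None, since w2 was not listed in inputs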

Both solutions are based on the earlier posts in this thread. Thanks again.

1 Like

I am facing the same issue and have been stuck on it for a day.
Here is my code:

class RNN(nn.Module):
    
    def __init__(self,input_size, output_size, hidden_size=64):

        super().__init__()

        self.input_size  = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        self.xh = nn.Linear(self.input_size, self.hidden_size, bias=False)


        self.hh = nn.Linear(self.hidden_size, self.hidden_size)
        self.hy = nn.Linear(self.hidden_size, self.output_size)
        
        self.h = torch.zeros(self.hidden_size, requires_grad=True)
        
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)
        self.sigmoid = nn.Sigmoid()

    def rnn_cell(self, x_t):  
        
        first_h = self.hh(self.h)
        
        second_x = self.xh(x_t)

        act = second_x + first_h
        
        self.h = self.tanh(act)

        updated_c = self.sigmoid(self.hy(self.h))

        return updated_c


    def forward(self, inp):
        return self.rnn_cell(inp)

Here is the training code:

def train(train_x,  valid_x, lr, epochs, hidden_units, net='RNN'):
    
    for step, (data, label) in enumerate(train_x):
        inputs = np.array(data)
        break

    if net=='RNN':
        model = RNN(inputs.shape[1], 1, hidden_units)
    elif net == 'LSTM':
        h = torch.zeros(hidden_units).requires_grad_()
        c = torch.zeros(hidden_units).requires_grad_()
        model = LSTM(inputs.shape[1], 1, hidden_units)
    elif net == 'GRU':
        St_1 = torch.zeros(hidden_units).requires_grad_()
        model = GRUModel(inputs.shape[1], 1, hidden_units)
    model.to(device)
    
    
    train_loss, val_loss = [],[]
    train_accuracy, val_accuracy = [], []
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    criterion = nn.BCELoss()

    
    
    for ep in range(epochs):
        running_loss, correct = 0, 0
        for i, (data, label) in enumerate(train_x):
            data, label = Variable(data), Variable(label)
            data, label = data.to(device), label.to(device)
            
            optimizer.zero_grad()
            
            if net == 'RNN':
                net_out= model(data)
            elif net == 'LSTM':
                net_out, h, c = model(data, h, c)
            elif net == 'GRU':
                net_out, St_1 = model(data, St_1)
                
            
            label = torch.reshape(label, (label.shape[0], 1))
            net_out = torch.reshape(net_out, (label.shape[0], 1))
            label = label.float()
            loss = criterion(net_out, label)
            loss.backward(retain_graph=True, inputs=list(model.parameters()))
            optimizer.step()

            running_loss += loss.item()
#             pred = torch.argmax(net_out, axis=1)  # get the index of the max log-probability
#             actual = torch.argmax(label, axis=1)
            out = (net_out>0.5).float()
            correct += out.eq(label).sum()



        print(running_loss)
        print("Epoch:", ep)
        print(correct.item())
        print("Training Accuracy:", 100. * correct.item() / len(train_x.dataset))
        print("Train Loss:", running_loss / len(train_x.dataset))
        train_loss.append(running_loss / len(train_x.dataset))
        train_accuracy.append(correct / len(train_x.dataset))


#         test_loss = 0
#         correct = 0
#         with torch.no_grad():
#             for batch_idx, (data, target) in enumerate(valid_x):
#                 data, target = Variable(data), Variable(target)
#                 data, target = data.to(device), target.to(device)
#     #                 data = data.view(-1, 784)
#                 if net == 'RNN':
#                     net_out, _ = model(data, h)
#                 elif net == 'LSTM':
#                     net_out, _, _ = model(data, h, c)
#                 elif net == 'GRU':
#                     net_out, _ = model(data, St_1)
#                 net_out = torch.reshape(net_out, (net_out.shape[0],))
#                 # sum up batch loss
#                 target = target.float()
#                 test_loss += criterion(net_out, target).item()
#     #                 pred = torch.argmax(net_out, axis=1)  # get the index of the max log-probability
#     #                 actual = torch.argmax(label, axis=1)
#                 out = (net_out>0.5).float()
#                 correct += out.eq(target).sum()
#             val_loss.append(test_loss / len(valid_x.dataset))
#             val_accuracy.append(correct / len(valid_x.dataset))

#         print("Validation Accuracy:" , 100. * correct.item() / len(valid_x.dataset))
#         print("Validation Loss:", test_loss / len(valid_x.dataset)) 
#         print("----------------------------------------------------------")
    
    return model, train_loss, train_accuracy, val_loss, val_accuracy
        
Here is the error traceback:

/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/torch/autograd/__init__.py:147: UserWarning: Error detected in MmBackward. Traceback of forward call that caused the error:
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/traitlets/config/application.py", line 845, in launch_instance
    app.start()
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 619, in start
    self.io_loop.start()
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
    self._run_once()
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
    handle._run()
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/asyncio/events.py", line 81, in _run
    self._context.run(self._callback, *self._args)
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/tornado/ioloop.py", line 688, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/tornado/ioloop.py", line 741, in _run_callback
    ret = callback()
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/tornado/gen.py", line 814, in inner
    self.ctx_run(self.run)
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/tornado/gen.py", line 775, in run
    yielded = self.gen.send(value)
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 358, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/tornado/gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/tornado/gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 536, in execute_request
    self.do_execute(
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/tornado/gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 302, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/ipykernel/zmqshell.py", line 539, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2898, in run_cell
    result = self._run_cell(
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2944, in _run_cell
    return runner(coro)
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3169, in run_cell_async
    has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3361, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3441, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-61-f6fbdf7371e3>", line 2, in <module>
    net, train_loss, train_accuracy, val_loss, val_accuracy = train(train_loader,  valid_loader, lr=0.0001, epochs=10,  hidden_units=64, net='RNN')
  File "<ipython-input-60-3bb8b3924f63>", line 35, in train
    net_out= model(data)
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "<ipython-input-59-cdbb099f3af3>", line 39, in forward
    return self.rnn_cell(inp)
  File "<ipython-input-59-cdbb099f3af3>", line 25, in rnn_cell
    first_h = self.hh(self.h)
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1051, in _call_impl
    return forward_call(*input, **kwargs)
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/torch/nn/modules/linear.py", line 96, in forward
    return F.linear(input, self.weight, self.bias)
  File "/Users/arslan/anaconda3/envs/torch_env/lib/python3.8/site-packages/torch/nn/functional.py", line 1847, in linear
    return torch._C._nn.linear(input, weight, bias)
 (Triggered internally at  /Users/distiller/project/conda/conda-bld/pytorch_1623459044803/work/torch/csrc/autograd/python_anomaly_mode.cpp:104.)
  Variable._execution_engine.run_backward(
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-61-f6fbdf7371e3> in <module>
      1 torch.autograd.set_detect_anomaly(True)
----> 2 net, train_loss, train_accuracy, val_loss, val_accuracy = train(train_loader,  valid_loader, lr=0.0001, epochs=10,  hidden_units=64, net='RNN')

<ipython-input-60-3bb8b3924f63> in train(train_x, valid_x, lr, epochs, hidden_units, net)
     44             label = label.float()
     45             loss = criterion(net_out, label)
---> 46             loss.backward(retain_graph=True, inputs=list(model.parameters()))
     47             optimizer.step()
     48 

~/anaconda3/envs/torch_env/lib/python3.8/site-packages/torch/_tensor.py in backward(self, gradient, retain_graph, create_graph, inputs)
    253                 create_graph=create_graph,
    254                 inputs=inputs)
--> 255         torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
    256 
    257     def register_hook(self, hook):

~/anaconda3/envs/torch_env/lib/python3.8/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables, inputs)
    145         retain_graph = create_graph
    146 
--> 147     Variable._execution_engine.run_backward(
    148         tensors, grad_tensors_, retain_graph, create_graph, inputs,
    149         allow_unreachable=True, accumulate_grad=True)  # allow_unreachable flag

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [64, 64]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

Can you please help, @ptrblck @albanD? I’ll be very thankful to all of you.

I guess the error is raised because you are not detaching the hidden state, are retaining the graph, and are updating the parameters in each iteration. This causes the intermediate forward activations from previous iterations to become “stale”, so the gradient computation fails.
I don’t know what your exact use case is, but you might want to detach() the hidden and cell states in each iteration and remove retain_graph=True as well.
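
Something along these lines, as a rough sketch (it truncates backpropagation to the current step, which may or may not fit your use case):

def rnn_cell(self, x_t):
    h_prev = self.h.detach()  # cut the graph to the previous iteration
    act = self.xh(x_t) + self.hh(h_prev)
    self.h = self.tanh(act)
    return self.sigmoid(self.hy(self.h))

and then call loss.backward() in the training loop without retain_graph=True.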

1 Like

I solved the problem by removing the “inplace=True” arguments (PyTorch 1.7).

Please, could you give me more details?
Where did you put ‘inplace=True’?

Hey, has this issue been solved? I am having the same issue in a very similar scenario. If it has been solved, could you please share the solution?

Thanks, Jeff. It worked for me.

1 Like

I’m getting this error at loss.backward() and I don’t know what causes it. I’m using PyTorch 1.10.1.

Exception has occurred: RuntimeError
one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [48, 2048]], which is output 0 of ReluBackward0, is at version 1; expected version 0 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
  File "/home/ma/Downloads/Thesis/main_top_down_baseline.py", line 45, in train
    loss.backward().clone()
  File "/home/ma/Downloads/Thesis/main_top_down_baseline.py", line 253, in main
    train(model, train_loader, dev_loader, optimizer, scheduler, n_epoch, args.output_dir, encoder, args.gpuid, clip_norm, model_name, args.model_saving_name,
  File "/home/ma/Downloads/Thesis/main_top_down_baseline.py", line 262, in <module>
    main()

This is the model I’m using:

'''
This is the baseline model.
We directly use top-down VQA like mechanism for SR
Modified bottom-up top-down code from https://github.com/hengyuan-hu/bottom-up-attention-vqa and
added normalization from https://github.com/yuzcccc/vqa-mfb
'''

import torch
import torch.nn as nn
import torch.nn.functional as F
import sys
torch.autograd.set_detect_anomaly(True)
sys.path.append('/home/ma/Downloads/Thesis')
from lib.attention import Attention
from lib.classifier import SimpleClassifier
from lib.fc import FCNet
import torchvision as tv

class vgg16_modified(nn.Module):
    def __init__(self):
        super(vgg16_modified, self).__init__()
        vgg = tv.models.vgg16_bn(pretrained=True)
        self.vgg_features = vgg.features

    def forward(self,x):
        features = self.vgg_features(x)
        return features

class Top_Down_Baseline(nn.Module):
    def __init__(self, convnet, role_emb, verb_emb, query_composer, v_att, q_net, v_net, classifier, encoder, Dropout_C):
        super(Top_Down_Baseline, self).__init__()
        self.convnet = convnet
        self.role_emb = role_emb
        self.verb_emb = verb_emb
        self.query_composer = query_composer
        self.v_att = v_att
        self.q_net = q_net
        self.v_net = v_net
        self.classifier = classifier
        self.encoder = encoder
        self.Dropout_C = Dropout_C

    def forward(self, v_org, gt_verb):
        '''
        :param v_org: original image
        :param gt_verb: ground truth verb id
        :return: predicted role label logits
        '''

        img_features = self.convnet(v_org)
        batch_size, n_channel, conv_h, conv_w = img_features.size()

        img_org = img_features.view(batch_size, -1, conv_h* conv_w)
        v = img_org.permute(0, 2, 1)

        batch_size = v.size(0)

        role_idx = self.encoder.get_role_ids_batch(gt_verb)

        if torch.cuda.is_available():
            role_idx = role_idx.to(torch.device('cuda'))

        img = v

        img = img.expand(self.encoder.max_role_count, img.size(0), img.size(1), img.size(2))

        img = img.transpose(0,1)
        img = img.contiguous().view(batch_size * self.encoder.max_role_count, -1, v.size(2))

        verb_embd = self.verb_emb(gt_verb)
        role_embd = self.role_emb(role_idx)

        verb_embed_expand = verb_embd.expand(self.encoder.max_role_count, verb_embd.size(0), verb_embd.size(1))
        verb_embed_expand = verb_embed_expand.transpose(0,1)
        #query for image reasoning
        concat_query = torch.cat([ verb_embed_expand, role_embd], -1)
        role_verb_embd = concat_query.contiguous().view(-1, role_embd.size(-1)*2)
        q_emb = self.query_composer(role_verb_embd)

        att = self.v_att(img, q_emb)
        v_emb = (att * img).sum(1)

        v_repr = self.v_net(v_emb)
        q_repr = self.q_net(q_emb)

        mfb_iq_eltwise = torch.mul(q_repr, v_repr)

        mfb_iq_drop = self.Dropout_C(mfb_iq_eltwise)

        #normalization to avoid model convergence to unsatisfactory local minima
        mfb_iq_resh = mfb_iq_drop.view(batch_size* self.encoder.max_role_count, 1, -1, 1)
        # sum pooling can be more useful if there are multiple heads like original MFB.
        # we kept our head count at 1 for the final implementation, but experimented with multiple heads without considerable improvement.
        mfb_iq_sumpool = torch.sum(mfb_iq_resh, 3, keepdim=True)
        mfb_out = torch.squeeze(mfb_iq_sumpool)
        mfb_sign_sqrt = torch.sqrt(F.relu(mfb_out)) - torch.sqrt(F.relu(-mfb_out))
        mfb_l2 = F.normalize(mfb_sign_sqrt)
        out = mfb_l2

        logits = self.classifier(out)

        role_label_pred = logits.contiguous().view(v.size(0), self.encoder.max_role_count, -1)

        return role_label_pred

    def forward_hiddenrep(self, v_org, gt_verb):

        '''
        :param v_org: original image
        :param gt_verb: ground truth verb id
        :return: hidden representation which is the input to the classifier
        '''

        img_features = self.convnet(v_org)
        batch_size, n_channel, conv_h, conv_w = img_features.size()

        img_org = img_features.view(batch_size, -1, conv_h* conv_w)
        v = img_org.permute(0, 2, 1)

        batch_size = v.size(0)

        role_idx = self.encoder.get_role_ids_batch(gt_verb)

        if torch.cuda.is_available():
            role_idx = role_idx.to(torch.device('cuda'))

        img = v

        img = img.expand(self.encoder.max_role_count, img.size(0), img.size(1), img.size(2))

        img = img.transpose(0,1)
        img = img.contiguous().view(batch_size * self.encoder.max_role_count, -1, v.size(2))

        verb_embd = self.verb_emb(gt_verb)
        role_embd = self.role_emb(role_idx)

        verb_embed_expand = verb_embd.expand(self.encoder.max_role_count, verb_embd.size(0), verb_embd.size(1))
        verb_embed_expand = verb_embed_expand.transpose(0,1)
        concat_query = torch.cat([ verb_embed_expand, role_embd], -1)
        role_verb_embd = concat_query.contiguous().view(-1, role_embd.size(-1)*2)
        q_emb = self.query_composer(role_verb_embd)

        att = self.v_att(img, q_emb)
        v_emb = (att * img).sum(1)

        v_repr = self.v_net(v_emb)
        q_repr = self.q_net(q_emb)

        mfb_iq_eltwise = torch.mul(q_repr, v_repr)

        mfb_iq_drop = self.Dropout_C(mfb_iq_eltwise)

        mfb_iq_resh = mfb_iq_drop.view(batch_size* self.encoder.max_role_count, 1, -1, 1)   # N x 1 x 1000 x 5
        mfb_iq_sumpool = torch.sum(mfb_iq_resh, 3, keepdim=True)    # N x 1 x 1000 x 1
        mfb_out = torch.squeeze(mfb_iq_sumpool)                     # N x 1000
        mfb_sign_sqrt = torch.sqrt(F.relu(mfb_out)) - torch.sqrt(F.relu(-mfb_out))
        mfb_l2 = F.normalize(mfb_sign_sqrt)
        out = mfb_l2

        return out

    def forward_agentplace_noverb(self, v_org, pred_verb):

        max_role_count = 2

        img_features = self.convnet(v_org)
        batch_size, n_channel, conv_h, conv_w = img_features.size()

        img_org = img_features.view(batch_size, -1, conv_h* conv_w)
        v = img_org.permute(0, 2, 1)

        batch_size = v.size(0)

        role_idx = self.encoder.get_agent_place_ids_batch(batch_size)

        if torch.cuda.is_available():
            role_idx = role_idx.to(torch.device('cuda'))

        img = v

        img = img.expand(max_role_count, img.size(0), img.size(1), img.size(2))

        img = img.transpose(0,1)
        img = img.contiguous().view(batch_size * max_role_count, -1, v.size(2))

        #verb_embd = torch.sum(self.verb_emb.weight, 0)
        #verb_embd = verb_embd.expand(batch_size, verb_embd.size(-1))
        #verb_embd = torch.zeros(batch_size, 300).cuda()
        verb_embd = self.verb_emb(pred_verb)

        role_embd = self.role_emb(role_idx)

        verb_embed_expand = verb_embd.expand(max_role_count, verb_embd.size(0), verb_embd.size(1))
        verb_embed_expand = verb_embed_expand.transpose(0,1)
        concat_query = torch.cat([ verb_embed_expand, role_embd], -1)
        role_verb_embd = concat_query.contiguous().view(-1, role_embd.size(-1)*2)
        q_emb = self.query_composer(role_verb_embd)

        att = self.v_att(img, q_emb)
        v_emb = (att * img).sum(1)

        v_repr = self.v_net(v_emb)
        q_repr = self.q_net(q_emb)

        mfb_iq_eltwise = torch.mul(q_repr, v_repr)

        mfb_iq_drop = self.Dropout_C(mfb_iq_eltwise)

        mfb_iq_resh = mfb_iq_drop.view(batch_size* max_role_count, 1, -1, 1)   # N x 1 x 1000 x 5
        mfb_iq_sumpool = torch.sum(mfb_iq_resh, 3, keepdim=True)    # N x 1 x 1000 x 1
        mfb_out = torch.squeeze(mfb_iq_sumpool)                     # N x 1000
        mfb_sign_sqrt = torch.sqrt(F.relu(mfb_out)) - torch.sqrt(F.relu(-mfb_out))
        mfb_l2 = F.normalize(mfb_sign_sqrt)
        out = mfb_l2

        logits = self.classifier(out)

        role_label_pred = logits.contiguous().view(v.size(0), max_role_count, -1)
        role_label_rep = v_repr.contiguous().view(v.size(0), max_role_count, -1)

        return role_label_pred, role_label_rep

    def calculate_loss(self, gt_verbs, role_label_pred, gt_labels):

        batch_size = role_label_pred.size()[0]
        criterion = nn.CrossEntropyLoss(ignore_index=self.encoder.get_num_labels())

        gt_label_turned = gt_labels.transpose(1,2).contiguous().view(batch_size* self.encoder.max_role_count*3, -1)

        role_label_pred = role_label_pred.contiguous().view(batch_size* self.encoder.max_role_count, -1)
        role_label_pred = role_label_pred.expand(3, role_label_pred.size(0), role_label_pred.size(1))
        role_label_pred = role_label_pred.transpose(0,1)
        role_label_pred = role_label_pred.contiguous().view(-1, role_label_pred.size(-1))

        loss = criterion(role_label_pred, gt_label_turned.squeeze(1)) * 3

        return loss

def build_top_down_baseline(n_roles, n_verbs, num_ans_classes, encoder):

    hidden_size = 1024
    word_embedding_size = 300
    img_embedding_size = 512

    covnet = vgg16_modified()
    role_emb = nn.Embedding(n_roles+1, word_embedding_size, padding_idx=n_roles)
    verb_emb = nn.Embedding(n_verbs, word_embedding_size)
    query_composer = FCNet([word_embedding_size * 2, hidden_size])
    v_att = Attention(img_embedding_size, hidden_size, hidden_size)
    q_net = FCNet([hidden_size, hidden_size ])
    v_net = FCNet([img_embedding_size, hidden_size])
    classifier = SimpleClassifier(
        hidden_size, 2 * hidden_size, num_ans_classes, 0.5)

    Dropout_C = nn.Dropout(0.1)

    return Top_Down_Baseline(covnet, role_emb, verb_emb, query_composer, v_att, q_net,
                                                           v_net, classifier, encoder, Dropout_C)



This is the training loop:

import torch
import json
import os

from utils import utils, imsitu_scorer, imsitu_loader, imsitu_encoder
from models import top_down_baseline


def train(model, train_loader, dev_loader, optimizer, scheduler, max_epoch, model_dir, encoder, gpu_mode, clip_norm, model_name, model_saving_name, eval_frequency=4000):
    model.train()
    train_loss = 0
    total_steps = 0
    print_freq = 400
    dev_score_list = []

    if gpu_mode > 0 :
        ngpus = 2
        device_array = [i for i in range(0,ngpus)]

        pmodel = torch.nn.DataParallel(model, device_ids=device_array)
    else:
        pmodel = model

    top1 = imsitu_scorer.imsitu_scorer(encoder, 1, 3)
    top5 = imsitu_scorer.imsitu_scorer(encoder, 5, 3)

    for epoch in range(max_epoch):

        for i, (_, img, verb, labels) in enumerate(train_loader):
            total_steps += 1

            if gpu_mode >= 0:
                
                img = torch.autograd.Variable(img.cuda())
                verb = torch.autograd.Variable(verb.cuda())
                labels = torch.autograd.Variable(labels.cuda())
            else:
                img = torch.autograd.Variable(img)
                verb = torch.autograd.Variable(verb)
                labels = torch.autograd.Variable(labels)

            role_predict = pmodel(img, verb)
            loss = model.calculate_loss(verb, role_predict, labels)

            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), clip_norm)

            optimizer.step()
            optimizer.zero_grad()

            train_loss += loss.item()

            top1.add_point_noun(verb, role_predict, labels)
            top5.add_point_noun(verb, role_predict, labels)


            if total_steps % print_freq == 0:
                top1_a = top1.get_average_results_nouns()
                top5_a = top5.get_average_results_nouns()
                print ("{},{},{}, {} , {}, loss = {:.2f}, avg loss = {:.2f}"
                       .format(total_steps-1,epoch,i, utils.format_dict(top1_a, "{:.2f}", "1-"),
                               utils.format_dict(top5_a,"{:.2f}","5-"), loss.item(),
                               train_loss / ((total_steps-1)%eval_frequency) ))


            if total_steps % eval_frequency == 0:
                top1, top5, val_loss = eval(model, dev_loader, encoder, gpu_mode)
                model.train()

                top1_avg = top1.get_average_results_nouns()
                top5_avg = top5.get_average_results_nouns()

                avg_score = top1_avg["verb"] + top1_avg["value"] + top1_avg["value-all"] + top5_avg["verb"] + \
                            top5_avg["value"] + top5_avg["value-all"] + top5_avg["value*"] + top5_avg["value-all*"]
                avg_score /= 8

                print ('Dev {} average :{:.2f} {} {}'.format(total_steps-1, avg_score*100,
                                                             utils.format_dict(top1_avg,'{:.2f}', '1-'),
                                                             utils.format_dict(top5_avg, '{:.2f}', '5-')))
                dev_score_list.append(avg_score)
                max_score = max(dev_score_list)

                if max_score == dev_score_list[-1]:
                    torch.save(model.state_dict(), model_dir + "/{}_{}.model".format( model_name, model_saving_name))
                    print ('New best model saved! {0}'.format(max_score))

                print('current train loss', train_loss)
                train_loss = 0
                top1 = imsitu_scorer.imsitu_scorer(encoder, 1, 3)
                top5 = imsitu_scorer.imsitu_scorer(encoder, 5, 3)

            del role_predict, loss, img, verb, labels
        print('Epoch ', epoch, ' completed!')
        scheduler.step()

def eval(model, dev_loader, encoder, gpu_mode, write_to_file = False):
    model.eval()

    print ('evaluating model...')
    top1 = imsitu_scorer.imsitu_scorer(encoder, 1, 3, write_to_file)
    top5 = imsitu_scorer.imsitu_scorer(encoder, 5, 3)
    with torch.no_grad():

        for i, (img_id, img, verb, labels) in enumerate(dev_loader):

            print(img_id[0], encoder.verb2_role_dict[encoder.verb_list[verb[0]]])

            if gpu_mode >= 0:
                img = torch.autograd.Variable(img.cuda())
                verb = torch.autograd.Variable(verb.cuda())
                labels = torch.autograd.Variable(labels.cuda())
                labels = torch.autograd.Variable(labels.cuda())
            else:
                img = torch.autograd.Variable(img)
                verb = torch.autograd.Variable(verb)
                labels = torch.autograd.Variable(labels)

            role_predict = model(img, verb)

            top1.add_point_noun(verb, role_predict, labels)
            top5.add_point_noun(verb, role_predict, labels)

            del role_predict, img, verb, labels

    return top1, top5, 0


def main():

    import argparse
    parser = argparse.ArgumentParser(description="imsitu VSRL. Training, evaluation and prediction.")
    parser.add_argument("--gpuid", default=0, help="put GPU id > -1 in GPU mode", type=int)
    parser.add_argument('--output_dir', type=str, default='./main-TDA', help='Location to output the model')
    parser.add_argument('--resume_training', action='store_true', help='Resume training from the model [resume_model]')
    parser.add_argument('--resume_model', type=str, default='', help='The model we resume')
    parser.add_argument('--evaluate', action='store_true', help='Only use the testing mode')
    parser.add_argument('--evaluate_visualize', action='store_true', help='Only use the testing mode to visualize ')
    parser.add_argument('--evaluate_rare', action='store_true', help='Only use the testing mode')
    parser.add_argument('--test', action='store_true', help='Only use the testing mode')
    parser.add_argument('--dataset_folder', type=str, default='./imSitu', help='Location of annotations')
    parser.add_argument('--imgset_dir', type=str, default='./resized_256', help='Location of original images')
    parser.add_argument('--train_file', default="train_freq2000.json", type=str, help='trainfile name')
    parser.add_argument('--dev_file', default="dev_freq2000.json", type=str, help='dev file name')
    parser.add_argument('--test_file', default="test_freq2000.json", type=str, help='test file name')
    parser.add_argument('--model_saving_name', type=str, help='saving name of the output model')

    parser.add_argument('--epochs', type=int, default=200)
    parser.add_argument('--model', type=str, default='top_down_baseline')
    parser.add_argument('--batch_size', type=int, default=8)
    parser.add_argument('--seed', type=int, default=1111, help='random seed')
    parser.add_argument('--clip_norm', type=float, default=0.25)
    parser.add_argument('--num_workers', type=int, default=3)

    args = parser.parse_args()

    n_epoch = args.epochs
    batch_size = args.batch_size
    clip_norm = args.clip_norm
    n_worker = args.num_workers

    dataset_folder = args.dataset_folder
    imgset_folder = args.imgset_dir

    train_set = json.load(open(dataset_folder + '/' + args.train_file))

    encoder = imsitu_encoder.imsitu_encoder(train_set)

    train_set = imsitu_loader.imsitu_loader(imgset_folder, train_set, encoder,'train', encoder.train_transform)

    constructor = 'build_%s' % args.model
    model = getattr(top_down_baseline, constructor)(encoder.get_num_roles(),encoder.get_num_verbs(), encoder.get_num_labels(), encoder)

    train_loader = torch.utils.data.DataLoader(train_set, batch_size=batch_size, shuffle=True, num_workers=n_worker)

    dev_set = json.load(open(dataset_folder + '/' + args.dev_file))
    dev_set = imsitu_loader.imsitu_loader(imgset_folder, dev_set, encoder, 'val', encoder.dev_transform)
    dev_loader = torch.utils.data.DataLoader(dev_set, batch_size=batch_size, shuffle=True, num_workers=n_worker)

    test_set = json.load(open(dataset_folder + '/' + args.test_file))
    test_set = imsitu_loader.imsitu_loader(imgset_folder, test_set, encoder, 'test', encoder.dev_transform)
    test_loader = torch.utils.data.DataLoader(test_set, batch_size=batch_size, shuffle=True, num_workers=n_worker)

    if not os.path.exists(args.output_dir):
        os.mkdir(args.output_dir)

    torch.manual_seed(args.seed)
    if args.gpuid >= 0:
        model.cuda()
        torch.cuda.manual_seed(args.seed)
        torch.backends.cudnn.benchmark = True

    if args.resume_training:
        print('Resume training from: {}'.format(args.resume_model))
        args.train_all = True
        if len(args.resume_model) == 0:
            raise Exception('[pretrained module] not specified')
        utils.load_net(args.resume_model, [model])
        optimizer = torch.optim.Adamax(model.parameters(), lr=1e-3)
        model_name = 'resume_all'

    else:
        print('Training from scratch.')
        model_name = 'train_full'
        utils.set_trainable(model, True)
        optimizer = torch.optim.Adamax([
            {'params': model.convnet.parameters(), 'lr': 5e-5},
            {'params': model.role_emb.parameters()},
            {'params': model.verb_emb.parameters()},
            {'params': model.query_composer.parameters()},
            {'params': model.v_att.parameters()},
            {'params': model.q_net.parameters()},
            {'params': model.v_net.parameters()},
            {'params': model.classifier.parameters()}
        ], lr=1e-3)

    scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

    if args.evaluate:
        top1, top5, val_loss = eval(model, dev_loader, encoder, args.gpuid)

        top1_avg = top1.get_average_results_nouns()
        top5_avg = top5.get_average_results_nouns()

        avg_score = top1_avg["verb"] + top1_avg["value"] + top1_avg["value-all"] + top5_avg["verb"] + \
                    top5_avg["value"] + top5_avg["value-all"] + top5_avg["value*"] + top5_avg["value-all*"]
        avg_score /= 8

        print ('Dev average :{:.2f} {} {}'.format( avg_score*100,
                                                   utils.format_dict(top1_avg,'{:.2f}', '1-'),
                                                   utils.format_dict(top5_avg, '{:.2f}', '5-')))



    elif args.test:
        top1, top5, val_loss = eval(model, test_loader, encoder, args.gpuid)

        top1_avg = top1.get_average_results_nouns()
        top5_avg = top5.get_average_results_nouns()

        avg_score = top1_avg["verb"] + top1_avg["value"] + top1_avg["value-all"] + top5_avg["verb"] + \
                    top5_avg["value"] + top5_avg["value-all"] + top5_avg["value*"] + top5_avg["value-all*"]
        avg_score /= 8

        print ('Test average :{:.2f} {} {}'.format( avg_score*100,
                                                    utils.format_dict(top1_avg,'{:.2f}', '1-'),
                                                    utils.format_dict(top5_avg, '{:.2f}', '5-')))


    else:

        print('Model training started!')
        train(model, train_loader, dev_loader, optimizer, scheduler, n_epoch, args.output_dir, encoder, args.gpuid, clip_norm, model_name, args.model_saving_name,
              )

if __name__ == "__main__":
    main()

The solution provided by Jeff is not working for me. @ptrblck @albanD, can you help me?

Could you reduce the code by removing unnecessary parts and post an executable code snippet to reproduce the issue, please?

Hi, I’m facing a similar issue, but I don’t know what causes this error to arise.
This is my model:

class ResidualBlock(nn.Module):
  def __init__(self, in_channels, kernel_size=1, stride=1, padding=0, num_res = 2):
    if num_res%2 !=0:
      raise ValueError(f'num_res={num_res} which is not even, num_res must be even')
    super(ResidualBlock, self).__init__()
    layers = []

    for _ in range(num_res):
      layers.append(nn.Conv2d(in_channels = in_channels,
                  out_channels = in_channels, kernel_size=kernel_size, stride=stride, padding=padding))
      layers.append(nn.ReLU())

    self.seq = nn.Sequential(*layers)

  def forward(self, x):
    res = torch.clone(x)

    for i in range(len(self.seq)):
      x = self.seq[i].forward(x)
      # print(self.seq[i])
      if (i+1)%4==0:
        # print('res')
        x+=res
        res = torch.clone(x)

    return x

class Resnet16(nn.Module):
  def __init__(self, in_channels=3):
    super(Resnet16, self).__init__()
    self.convnet = nn.Sequential(
        nn.Conv2d(in_channels=in_channels, out_channels=64, kernel_size=3, stride=1, padding=1),
        nn.ReLU(),
        ResidualBlock(in_channels=64,kernel_size=3, stride=1, padding=1,num_res=4),
        nn.Conv2d(in_channels=64, out_channels=256, kernel_size=3, stride=2, padding=1),
        nn.ReLU(),
        ResidualBlock(in_channels=256, kernel_size=3, stride=1, padding=1, num_res=4),
        nn.BatchNorm2d(256),
        nn.Conv2d(in_channels=256, out_channels=512, kernel_size=3, stride=2, padding=1),
        nn.ReLU(),
        ResidualBlock(in_channels=512, kernel_size=3, stride=1, padding=1, num_res=4),
        nn.AvgPool2d(kernel_size=3),
    )

    self.fcnet = nn.Sequential(
        nn.Linear(2048,512),
        nn.ReLU(),
        nn.Linear(512,256),
        nn.ReLU(),
        nn.Linear(256,10)
    )


  def forward(self,x):
    x = self.convnet(x)
    bs = x.shape[0]
    x = x.view(bs,-1)
    x = self.fcnet(x)
    return x

And this is the part of the code that sets up the optimizer and trains my model:

from torch.optim import Adam

# model = VGG()
model = Resnet16(3)
optimizer_params = {
    'params': model.parameters(),
    'lr':1e-3
}

optimizer = Adam(optimizer_params['params'],optimizer_params['lr'])

trainer_params = {
    'model': model,
    'train_loader': trainloader,
    'valid_loader': testloader,
    'criterion': nn.CrossEntropyLoss(),
    'optimizer': optimizer,
    'epochs': 10,
    'print_every':2,
    'pth_filename': 'model.pth',
    'trainset_sz': len(trainset),
    'validset_sz': len(testset)
}

.
.
.
.

for data, labels in train_loader:
      # putting data in working device
      data = data.to(device)
      labels = labels.to(device)

      # forward propagation
      output = model(data)
      loss = criterion(output, labels)

      # backward propagation
      optimizer.zero_grad()
      loss.backward()
      optimizer.step()

      # logging training loop results
      training_loss += loss.item() 

Any help will be much appreciated @ptrblck @albanD!

Try to remove all inplace operations, such as x += res, and see if this helps.
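
For example, the forward pass of the ResidualBlock could be rewritten with out-of-place ops only (a sketch of the same logic):

def forward(self, x):
    res = torch.clone(x)
    for i in range(len(self.seq)):
        x = self.seq[i](x)
        if (i + 1) % 4 == 0:
            x = x + res  # out-of-place addition instead of x += res
            res = torch.clone(x)
    return x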

5 Likes

It worked, thanks so much!!

Hi guys, I’m facing a similar issue when I call backward() too, and I really need your help. Big thanks in advance. :grin:
@ptrblck @albanD
Code in the training process:

def train(rho_data, size, train_size, mine_net, optimizer, iteration, input_size, tau):
    criterion = nn.BCEWithLogitsLoss()
    diff_et = torch.tensor(0.0)
    data, test_p0, test_q0, label, train_index, marg_index = recons_data(rho_data, size,
                                                                         train_size)
    for i in range(iteration):
        batch_size = int(len(data)/4)
        if input_size == 2:
            test_p = torch.FloatTensor(test_p0[:,[0,2]])
            test_q = torch.FloatTensor(test_q0[:,[0,2]])
        else:
            test_p = torch.FloatTensor(test_p0)
            test_q = torch.FloatTensor(test_q0)

        train_batch, index1, index2 = sample_batch(data, input_size,
                                                   batch_size = batch_size,
                                                   sample_mode = 'joint')
        label_batch = label[index1]
        train_batch = torch.autograd.Variable(torch.FloatTensor(train_batch), requires_grad=True)
        label_batch = torch.FloatTensor(label_batch)

        logit = mine_net(train_batch)[0]
        loss = criterion(logit.reshape(-1), label_batch)

        if i < iteration-1:
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        else:
            optimizer.zero_grad()
            loss.backward(retain_graph = True)
            optimizer.step()
            train_batch.grad.zero_()
            loss.backward()
            grads = train_batch.grad

        if i >= iteration-101:
            prob_p = mine_net(test_p)[1]
            rn_est_p = prob_p/(1-prob_p)
            finp_p = torch.log(torch.abs(rn_est_p))

            prob_q = mine_net(test_q)[1]
            rn_est_q = prob_q/(1-prob_q)
            a = torch.abs(rn_est_q)
            clip = torch.max(torch.min(a, torch.exp(tau)), torch.exp(-tau))
            diff_et = diff_et + torch.max(torch.mean(finp_p) - torch.log(torch.mean(clip)), torch.tensor(0.0))

    return (diff_et/100).detach().cpu().numpy(), grads, index1, train_index, marg_index

Error reported by the system:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [130, 1]], which is output 0 of AsStridedBackward0, is at version 1002; expected version 1001 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

Could you explain your use case, and in particular why you are using retain_graph=True, as this usually yields these kinds of errors (and is often used as a workaround for another error)?

Hi ptrblck, many thanks for your help. I have solved the bug; the problem was in this block:


else:
    optimizer.zero_grad()
    loss.backward(retain_graph = True)
    optimizer.step()       
    train_batch.grad.zero_()
    loss.backward()
    grads = train_batch.grad
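
For anyone hitting the same pattern: a single backward() call already populates the .grad field of every leaf tensor involved, including train_batch, so the retain_graph=True plus a second backward() after optimizer.step() can usually be dropped. A sketch, keeping the rest of the loop unchanged:

optimizer.zero_grad()
loss.backward()  # one pass fills .grad on the parameters and on train_batch
optimizer.step()
grads = train_batch.grad.clone()  # copy it before anything overwrites it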

Hi guys, I met the same problem with loss.backward(), as you can see here:

File "train.py", line 360, in train
    loss_adv.backward(retain_graph=True)
File "/usr/local/lib/python3.7/dist-packages/torch/_tensor.py", line 396, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph, inputs=inputs)
File "/usr/local/lib/python3.7/dist-packages/torch/autograd/__init__.py", line 175, in backward
    allow_unreachable=True, accumulate_grad=True)  # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [512, 7]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).

My code is


I use PyTorch 1.12.1 in Google Colab.
Can anyone help me solve this problem? Thank you very much.
@ptrblck @albanD, can you help me?

Could you also check why retain_graph is used in your code?