One of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1431, 20]],

I am getting this error; can anyone help me resolve it?

/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/torch/autograd/__init__.py:130: UserWarning: Error detected in MmBackward. Traceback of forward call that caused the error:
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/runpy.py", line 194, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/traitlets/config/application.py", line 845, in launch_instance
    app.start()
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/ipykernel/kernelapp.py", line 612, in start
    self.io_loop.start()
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/tornado/platform/asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/asyncio/base_events.py", line 570, in run_forever
    self._run_once()
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/asyncio/base_events.py", line 1859, in _run_once
    handle._run()
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/asyncio/events.py", line 81, in _run
    self._context.run(self._callback, *self._args)
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/tornado/ioloop.py", line 688, in <lambda>
    lambda f: self._run_callback(functools.partial(callback, future))
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/tornado/ioloop.py", line 741, in _run_callback
    ret = callback()
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/tornado/gen.py", line 814, in inner
    self.ctx_run(self.run)
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/tornado/gen.py", line 775, in run
    yielded = self.gen.send(value)
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 358, in process_one
    yield gen.maybe_future(dispatch(*args))
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/tornado/gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 261, in dispatch_shell
    yield gen.maybe_future(handler(stream, idents, msg))
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/tornado/gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/ipykernel/kernelbase.py", line 536, in execute_request
    self.do_execute(
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/tornado/gen.py", line 234, in wrapper
    yielded = ctx_run(next, result)
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/ipykernel/ipkernel.py", line 302, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/ipykernel/zmqshell.py", line 539, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2898, in run_cell
    result = self._run_cell(
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2944, in _run_cell
    return runner(coro)
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/IPython/core/async_helpers.py", line 68, in _pseudo_sync_runner
    coro.send(None)
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3169, in run_cell_async
    has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3361, in run_ast_nodes
    if (await self.run_code(code, result,  async_=asy)):
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3441, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-140-052c71d28f57>", line 2, in <module>
    net, train_loss, train_accuracy, val_loss, val_accuracy = train(train_loader,  valid_loader, lr=0.0001, epochs=3,  hidden_units=20, net='RNN')
  File "<ipython-input-139-ead948683b23>", line 36, in train
    net_out, h = model(data,h)
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "<ipython-input-138-9127ab8f7b70>", line 33, in forward
    return self.rnn_cell(inp, prev_h)
  File "<ipython-input-138-9127ab8f7b70>", line 22, in rnn_cell
    second_x = self.xh(x.detach())
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
    result = self.forward(*input, **kwargs)
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/torch/nn/modules/linear.py", line 93, in forward
    return F.linear(input, self.weight, self.bias)
  File "/Users/arslan/anaconda3/envs/data_work/lib/python3.8/site-packages/torch/nn/functional.py", line 1692, in linear
    output = input.matmul(weight.t())
 (Triggered internally at  /Users/distiller/project/conda/conda-bld/pytorch_1603740477510/work/torch/csrc/autograd/python_anomaly_mode.cpp:104.)
  Variable._execution_engine.run_backward(
---------------------------------------------------------------------------
RuntimeError                              Traceback (most recent call last)
<ipython-input-140-052c71d28f57> in <module>
      1 torch.autograd.set_detect_anomaly(True)
----> 2 net, train_loss, train_accuracy, val_loss, val_accuracy = train(train_loader,  valid_loader, lr=0.0001, epochs=3,  hidden_units=20, net='RNN')

<ipython-input-139-ead948683b23> in train(train_x, valid_x, lr, epochs, hidden_units, net)
     45             label = label.float()
     46             loss = criterion(net_out, label)
---> 47             loss.backward(retain_graph=True)
     48             optimizer.step()
     49 

~/anaconda3/envs/data_work/lib/python3.8/site-packages/torch/tensor.py in backward(self, gradient, retain_graph, create_graph)
    219                 retain_graph=retain_graph,
    220                 create_graph=create_graph)
--> 221         torch.autograd.backward(self, gradient, retain_graph, create_graph)
    222 
    223     def register_hook(self, hook):

~/anaconda3/envs/data_work/lib/python3.8/site-packages/torch/autograd/__init__.py in backward(tensors, grad_tensors, retain_graph, create_graph, grad_variables)
    128         retain_graph = create_graph
    129 
--> 130     Variable._execution_engine.run_backward(
    131         tensors, grad_tensors_, retain_graph, create_graph,
    132         allow_unreachable=True)  # allow_unreachable flag

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [1431, 20]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
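
This error means autograd saved a tensor during the forward pass and found it modified in place by the time backward ran: every tensor carries a version counter, and backward refuses to use a saved value whose counter has changed. The [1431, 20] tensor is "output 0 of TBackward", i.e. apparently the transposed weight of one of the Linear layers below. A minimal sketch, unrelated to the code in this thread, that triggers the same class of failure:

import torch

w = torch.randn(3, 3, requires_grad=True)   # stands in for a layer weight
x = torch.randn(3, 3, requires_grad=True)
y = x.matmul(w)        # matmul saves w, which backward needs for x's gradient
with torch.no_grad():
    w.add_(1.0)        # in-place update, as optimizer.step() does; version 0 -> 1
y.sum().backward()     # RuntimeError: ... is at version 1; expected version 0 instead

The model definition: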
import torch
import torch.nn as nn


class RNN(nn.Module):
    
    def __init__(self,input_size, output_size, hidden_size=64):

        super().__init__()

        self.input_size  = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        
        self.xh = nn.Linear(self.input_size, self.hidden_size, bias=False)
        self.hh = nn.Linear(self.hidden_size, self.hidden_size)
        self.hy = nn.Linear(self.hidden_size, self.output_size)
        
        
        self.tanh = nn.Tanh()
        self.softmax = nn.Softmax(dim=1)
        self.sigmoid = nn.Sigmoid()

    def rnn_cell(self, x, prev_h):  
        first_h = self.hh(prev_h)
        second_x = self.xh(x)

        act = second_x + first_h
        h = self.tanh(act)

        updated_c = self.sigmoid(self.hy(h))

        return updated_c, h


    def forward(self, inp, prev_h):
        return self.rnn_cell(inp, prev_h)
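
In other words, forward computes a single vanilla RNN step, h_t = tanh(W_xh x_t + W_hh h_{t-1} + b_h), plus an output y_t = sigmoid(W_hy h_t + b_y); the returned h therefore carries the autograd graph of everything it was computed from.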

I cannot reproduce the issue using:

model = RNN(1, 1)
x = torch.randn(1, 1)
target = torch.randint(0, 2, (1, 1)).float()
criterion = nn.BCELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)

for epoch in range(10):
    optimizer.zero_grad()
    prev = torch.randn(1, 64)
    output, h = model(x, prev)
    loss = criterion(output, target)
    loss.backward()
    optimizer.step()
    prev = h.detach()

Could you post an executable code snippet, which would reproduce the error you are seeing?

This is the training code:

import numpy as np
from torch.autograd import Variable  # deprecated, but kept because the code below uses it

# 'device' is not defined in the posted code; this definition is assumed
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def train(train_x, valid_x, lr, epochs, hidden_units, net='RNN'):
    
    for step, (data, label) in enumerate(train_x):
        inputs = np.array(data)
        break

    if net=='RNN':
        h = torch.zeros(hidden_units).requires_grad_()
        model = RNN(inputs.shape[1], 1, hidden_units)
    elif net == 'LSTM':
        h = torch.zeros(hidden_units).requires_grad_()
        c = torch.zeros(hidden_units).requires_grad_()
        model = LSTM(inputs.shape[1], 1, hidden_units)
    elif net == 'GRU':
        St_1 = torch.zeros(hidden_units).requires_grad_()
        model = GRUModel(inputs.shape[1], 1, hidden_units)
    model.to(device)
    
    
    train_loss, val_loss = [],[]
    train_accuracy, val_accuracy = [], []
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=0.9)
    criterion = nn.BCELoss()

    
    
    for ep in range(epochs):
        running_loss, correct = 0, 0
        for i, (data, label) in enumerate(train_x):
            data, label = Variable(data), Variable(label)
            data, label = data.to(device), label.to(device)
            
            optimizer.zero_grad()
            
            if net == 'RNN':
                net_out, h = model(data, h)
            elif net == 'LSTM':
                net_out, h, c = model(data, h, c)
            elif net == 'GRU':
                net_out, St_1 = model(data, St_1)
                
            
            label = torch.reshape(label, (label.shape[0], 1))
            net_out = torch.reshape(net_out, (label.shape[0], 1))
            label = label.float()
            loss = criterion(net_out, label)
            loss.backward(retain_graph=True)
            optimizer.step()

            running_loss += loss.item()
#             pred = torch.argmax(net_out, axis=1)  # get the index of the max log-probability
#             actual = torch.argmax(label, axis=1)
            out = (net_out>0.5).float()
            correct += out.eq(label).sum()



        print(running_loss)
        print("Epoch:", ep)
        print(correct.item())
        print("Training Accuracy:", 100. * correct.item() / len(train_x.dataset))
        print("Train Loss:", running_loss / len(train_x.dataset))
        train_loss.append(running_loss / len(train_x.dataset))
        train_accuracy.append(correct / len(train_x.dataset))


#         test_loss = 0
#         correct = 0
#         with torch.no_grad():
#             for batch_idx, (data, target) in enumerate(valid_x):
#                 data, target = Variable(data), Variable(target)
#                 data, target = data.to(device), target.to(device)
#     #                 data = data.view(-1, 784)
#                 if net == 'RNN':
#                     net_out, _ = model(data, h)
#                 elif net == 'LSTM':
#                     net_out, _, _ = model(data, h, c)
#                 elif net == 'GRU':
#                     net_out, _ = model(data, St_1)
#                 net_out = torch.reshape(net_out, (net_out.shape[0],))
#                 # sum up batch loss
#                 target = target.float()
#                 test_loss += criterion(net_out, target).item()
#     #                 pred = torch.argmax(net_out, axis=1)  # get the index of the max log-probability
#     #                 actual = torch.argmax(label, axis=1)
#                 out = (net_out>0.5).float()
#                 correct += out.eq(target).sum()
#             val_loss.append(test_loss / len(valid_x.dataset))
#             val_accuracy.append(correct / len(valid_x.dataset))

#         print("Validation Accuracy:" , 100. * correct.item() / len(valid_x.dataset))
#         print("Validation Loss:", test_loss / len(valid_x.dataset)) 
#         print("----------------------------------------------------------")
    
    return model, train_loss, train_accuracy, val_loss, val_accuracy
  

The code snippet is unfortunately not executable, so please add the missing code to create random tensors in the expected shapes, which would then reproduce this issue.

Here is the whole code to generate the data:

import numpy as np
import torch
from torch.utils.data import Dataset, SubsetRandomSampler

class MyDataset(Dataset):
    def __init__(self, data, targets):
        self.data = torch.from_numpy(data).type(torch.float)
        self.targets = targets

    def __getitem__(self, index):
        x = self.data[index]
        y = self.targets[index]
        return x, y

    def __len__(self):
        return len(self.data)

def numpy_to_dataloader(data, targets, batch_size = 50, validation_split = .2, shuffle_dataset = True):
    
    random_seed = 42
    dataset = MyDataset(data, targets)

    dataset_size = len(dataset)
    indices = list(range(dataset_size))
    split = int(np.floor(validation_split * dataset_size))
    if shuffle_dataset :
        np.random.seed(random_seed)
        np.random.shuffle(indices)
    train_indices, val_indices = indices[split:], indices[:split]


    train_sampler = SubsetRandomSampler(train_indices)
    valid_sampler = SubsetRandomSampler(val_indices)

    train_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size, 
                                               sampler=train_sampler)
    validation_loader = torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                                    sampler=valid_sampler)
    
    return train_loader, validation_loader
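
Note that MyDataset calls torch.from_numpy, which raises a TypeError when given a tensor, while the dummy data below is built with torch.randn. An assumed NumPy version with the same shapes would be:

X = np.random.randn(30, 20).astype(np.float32)
y = np.random.randint(2, size=(30,))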

Dummy data to run the model:


X = torch.randn(30, 20)
y = torch.randint(2, (30, ))
train_loader, valid_loader = numpy_to_dataloader(X, y, batch_size = 20, validation_split = .2, shuffle_dataset = True)
net, train_loss, train_accuracy, val_loss, val_accuracy = train(train_loader,  valid_loader, lr=0.0001, epochs=10,  hidden_units=64, net='RNN')

@ptrblck can you please check the code? Thanks.

The code is unfortunately still not executable. After fixing some np.array/tensor issues as well as device mismatches, I get:

RuntimeError: The size of tensor a (4) must match the size of tensor b (20) at non-singleton dimension 0
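
For reference: the usual cause of the original in-place error in this pattern is that h is carried from one batch to the next without being detached, so loss.backward(retain_graph=True) walks back through the graphs of all earlier iterations, and those graphs still reference the Linear weights that optimizer.step() has since updated in place; that is exactly the version mismatch autograd reports. (The non-reproducing snippet earlier re-creates prev inside the loop, so it never carries a graph across iterations.) A minimal sketch of the standard fix, assuming the hidden state should not backpropagate across batches and reusing the names from the train function above:

for i, (data, label) in enumerate(train_x):
    optimizer.zero_grad()
    net_out, h = model(data, h)
    h = h.detach()        # cut the graph so the next backward stops at this batch
    label = label.float().reshape(label.shape[0], 1)
    net_out = net_out.reshape(label.shape[0], 1)
    loss = criterion(net_out, label)
    loss.backward()       # retain_graph=True is no longer needed
    optimizer.step()

The remaining size mismatch (4 vs. 20) is presumably a last-batch effect: 30 samples with validation_split=0.2 leave 24 training samples, so batch_size=20 yields a final batch of 4, while h keeps the (20, hidden_units) shape it acquired from the previous batch.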