Getting this Error Message: "RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation"

Hi, I’m a new person to PyTorch, and am trying to learn a basic RNN from scratch in PyTorch, but I keep getting this runtimeerror. I’ve seen multiple threads about this, and I don’t think I have any in-place operators like += or []. Why is this happening? Below is my entire error message, entire code, and the backtrace error message for the detect anomaly. I’d appreciate the help, thank you so much!

Error Message:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [186, 18]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

Code:

import torch
from utils import *
import pdb
import json
import math
# import more functions needed on top of the given imported functions above:
from sklearn.model_selection import train_test_split
import numpy as np
import torch.nn as nn
from matplotlib import pyplot as plt
import time
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns
import pandas as pd

device = torch.device("cuda" if torch.cuda.is_available() else "cpu") #run on gpu if possible

n_letters=58
n_categories=18
n_hidden = 128
n_epochs = 100 # changed from 100 to 50 for faster training
print_every = 5000
plot_every = 1000
learning_rate = 0.0005 # changed from 0.005 to 0.001

class RNN(torch.nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        """
        Args:
            input_shape (int): size of the 1-hot embeddings for each character (this will be 58)
            hidden_layer_width (int): number of nodes in the single hidden layer within the model
            n_classes (int): number of output classes
        """
        super(RNN, self).__init__()
        ### TODO Implement the network architecture
        self.hidden_size = hidden_size
        self.input_to_hidden = nn.Linear(input_size + hidden_size, hidden_size)
        self.input_to_output = nn.Linear(input_size + hidden_size, output_size)
        self.softmax = nn.Softmax(dim = 1)
        # raise NotImplementedError


    def forward(self, input, hidden):
        """Forward function accepts tensor of input data, returns tensor of output data.
        Modules defined in constructor are used, along with arbitrary operators on tensors
        """

        ### TODO Implement the forward function
        input = input.to(device)
        hidden = hidden.to(device)
        concat = torch.cat((input, hidden), dim = 1)
        concat = concat.to(device)
        hidden = torch.nn.functional.softmax(self.input_to_hidden(concat))
        output = self.input_to_output(concat)
        output = output.to(device)
        output = self.softmax(output)
        # your function will return the output y(t) and hidden h(t) from equation 1 in the docs
        return output, hidden

        # raise NotImplementedError

    def initHidden(self):
        """
        This function initializes the first hidden state of the RNN as a zero tensor.
        """
        return torch.zeros(1, self.hidden_size)

def get_xy_pairs(names):
    #TODO
    #process the names dict and convert into a list of (x,y) pairs. x is a 1-hot tensor of size (num_characters_in_name, 1, n_letters)
    #y is a scalar representing the category of the language, there are 18 languages, assign an index between 0-17 to each language and y represents this index.
    #you may make use of the nameToTensor() function in the utils.py file to help you with this function
    # Turn a line into a <line_length x 1 x n_letters>,
    # or an array of one-hot letter vectors

    y_list = [item for item in names.keys()] #keys
    x_list = [item for item in names.values()] #values
    label_y_list = [idx for (idx, item) in enumerate(y_list)] # 0-17
    x_list_length = [nameToTensor(x_list[idx1]).shape[0] for idx1 in range(len(label_y_list))] #num of names of each language
    label_y_list = [np.repeat(label_y_list[i], x_list_length[i]) for i in range(len(label_y_list))] #repeat
    label_y_list = np.array(np.concatenate(label_y_list).flat) #flatten
    x_list = np.array(np.concatenate(x_list).flat) #flatten
    name_x_list = [nameToTensor(x_list[i]) for i in range(len(x_list))] #tensor
    list_of_pairs = list(zip(name_x_list, label_y_list)) #combine into list
    return list_of_pairs

def create_train_and_test_set(list_of_pairs):
    #TODO
    #process the list of (x,y) pairs and split them 80-20 into train and test set
    #train_x is a list of name embeddings each of size (num_characters_in_name, 1, n_letters), train_y is the correponding list of language category index. Same for test_x and test_y

    x_list = [i for i, j in list_of_pairs]
    y_list = [j for i, j in list_of_pairs]
    train_x, test_x, train_y, test_y = train_test_split(x_list, y_list, test_size=0.2, random_state = 42, stratify = y_list)
    return train_x, train_y, test_x, test_y

rnn = RNN(n_letters, n_hidden, n_categories)
rnn.to(device)
optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()

torch.autograd.set_detect_anomaly(True)

def train(train_x, train_y):
    """train_x and train_y are lists with names and correspoonding labels"""
    output_ra = []
    loss = 0
    for x, y in zip(train_x, train_y):
        hidden = rnn.initHidden()
        for i in range(x.size()[0]):
            output, hidden = rnn(x[i].clone(), hidden.clone())
        loss = loss + criterion(torch.log(output.to(device) + 1e-6), torch.tensor(y, dtype= torch.long).unsqueeze(0).to(device))
        loss = loss.to(device)
        optimizer.zero_grad()
        loss.backward(retain_graph = True)
        optimizer.step()
    output_ra.extend(output.detach().cpu().numpy())
    return loss, output_ra
    #     for i in range(x.size()[0]):
    #         output, hidden = rnn(x[i], hidden)
    #     loss += criterion(torch.log(output.to(device) + 1e-6), torch.tensor(y, dtype= torch.long).unsqueeze(0).to(device)) #the unsqueeze converts the scalar y to a 1D tensor
    #     loss = loss.to(device)
    #
    # output_ra.extend(output.detach().cpu().numpy())
    # optimizer.zero_grad()
    #
    # loss.backward()
    #
    # optimizer.step()
    #
    # return loss, output_ra

def test(train_x, train_y):
    """train_x and train_y are lists with names and correspoonding labels"""
    loss = 0
    output_ra = []
    with torch.no_grad():
        for x, y in zip(train_x, train_y):
            hidden = rnn.initHidden()
            for i in range(x.size()[0]):
                output, hidden = rnn(x[i], hidden)
            loss += criterion(torch.log(output.to(device) + 1e-6), torch.tensor(y, dtype= torch.long).unsqueeze(0).to(device))  #the unsqueeze converts the scalar y to a 1D tensor

            output_ra.extend(output.detach().cpu().numpy()) #also save the predicted labels at the last epoch for the confusion matrix

    return loss, output_ra

#names is your dataset in python dictionary form. Keys are languages and values are list of words belonging to that language
with open('names.json', 'r') as fp:
    names = json.load(fp)

list_of_pairs = get_xy_pairs(names)
train_x, train_y, test_x, test_y = create_train_and_test_set(list_of_pairs)

I get this error message when I call

for epoch in range(1, n_epochs + 1):     
   current_train_loss, train_pred = train(train_x, train_y)

Error message:

C:\Users\choke\anaconda3\lib\site-packages\torch\autograd\__init__.py:173: UserWarning: Error detected in AddmmBackward0. Traceback of forward call that caused the error:
  File "C:\Users\choke\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\choke\anaconda3\lib\runpy.py", line 87, in _run_code
    exec(code, run_globals)
  File "C:\Users\choke\anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "C:\Users\choke\anaconda3\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
    app.start()
  File "C:\Users\choke\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
    self.io_loop.start()
  File "C:\Users\choke\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
    self.asyncio_loop.run_forever()
  File "C:\Users\choke\anaconda3\lib\asyncio\base_events.py", line 601, in run_forever
    self._run_once()
File "C:\Users\choke\anaconda3\lib\asyncio\base_events.py", line 1905, in _run_once
    handle._run()
  File "C:\Users\choke\anaconda3\lib\asyncio\events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "C:\Users\choke\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 471, in dispatch_queue
    await self.process_one()
  File "C:\Users\choke\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 460, in process_one
    await dispatch(*args)
  File "C:\Users\choke\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 367, in dispatch_shell
    await result
  File "C:\Users\choke\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 662, in execute_request
    reply_content = await reply_content
  File "C:\Users\choke\anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 360, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "C:\Users\choke\anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 532, in run_cell
    return super().run_cell(*args, **kwargs)
  File "C:\Users\choke\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2863, in run_cell
result = self._run_cell(
  File "C:\Users\choke\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2909, in _run_cell
    return runner(coro)
  File "C:\Users\choke\anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
    coro.send(None)
  File "C:\Users\choke\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3106, in run_cell_async
    has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  File "C:\Users\choke\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3309, in run_ast_nodes
    if await self.run_code(code, result, async_=asy):
  File "C:\Users\choke\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3369, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\choke\AppData\Local\Temp\ipykernel_4776\3138297363.py", line 8, in <cell line: 7>
    current_train_loss, train_pred = train(train_x, train_y)
  File "C:\Users\choke\AppData\Local\Temp\ipykernel_4776\706923590.py", line 111, in train
    output, hidden = rnn(x[i].clone(), hidden.clone())
  File "C:\Users\choke\anaconda3\lib\site-packages\torch\nn\modules\module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
File "C:\Users\choke\AppData\Local\Temp\ipykernel_4776\706923590.py", line 54, in forward
    output = self.input_to_output(concat)
  File "C:\Users\choke\anaconda3\lib\site-packages\torch\nn\modules\module.py", line 1130, in _call_impl
    return forward_call(*input, **kwargs)
  File "C:\Users\choke\anaconda3\lib\site-packages\torch\nn\modules\linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
 (Triggered internally at  C:\cb\pytorch_1000000000000\work\torch\csrc\autograd\python_anomaly_mode.cpp:104.)

​ ​ ​ ​

These errors are often raised by using retain_graph = True as a workaround for another issue.
Could you explain why you are using it?
If you are not sure and added it to avoid the “trying to backpropagate a second time…” error, check if you have forgotten to detach the computation graph to avoid trying to recompute gradients from previous iterations.