RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [200, 100]], which is output 0 of AsStridedBackward0, is at version 3; expected version 1 instead

I was trying to train a very basic RNN on Shakespeare text, but I keep getting the following error and don’t know what is going wrong:

/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torch/autograd/__init__.py:197: UserWarning: Error detected in AddmmBackward0. Traceback of forward call that caused the error:
  File "/Users/aayush/Desktop/deeplearning/shakernn.py", line 99, in <module>
    train_network(model, loss_func, optimizer, lr, joined_lines, device)
  File "/Users/aayush/Desktop/deeplearning/shakernn.py", line 83, in train_network
    hidden_state, output = model(inp, hidden_state)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/Users/aayush/Desktop/deeplearning/shakernn.py", line 42, in forward
    hidden = self.i2h(combined)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torch/nn/modules/linear.py", line 114, in forward
    return F.linear(input, self.weight, self.bias)
 (Triggered internally at /Users/runner/work/pytorch/pytorch/pytorch/torch/csrc/autograd/python_anomaly_mode.cpp:119.)
  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
Traceback (most recent call last):
  File "/Users/aayush/Desktop/deeplearning/shakernn.py", line 99, in <module>
    train_network(model, loss_func, optimizer, lr, joined_lines, device)
  File "/Users/aayush/Desktop/deeplearning/shakernn.py", line 88, in train_network
    loss.backward()
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torch/_tensor.py", line 488, in backward
    torch.autograd.backward(
  File "/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/torch/autograd/__init__.py", line 197, in backward
    Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [200, 100]], which is output 0 of AsStridedBackward0, is at version 3; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

Here is the code:

import torch
import torch.nn as nn
import torch.nn.functional as F
from pprint import pprint

device = torch.device("cpu")
torch.autograd.set_detect_anomaly(True)

# Thin wrapper so the embedding weight matrix can be indexed directly with a token id
class EmbeddingSubscriptable:
	def __init__(self, *args, **kwargs):
		self.emb = nn.Embedding(*args, **kwargs)

	def __getitem__(self, idx):
		return self.emb.weight[idx].unsqueeze(0)

embedding_size = 100
hidden_size = 100

with open("data/shakespeare.txt") as f:
	lines = f.readlines()

joined_lines = ''.join(lines)
vocab = sorted(set(joined_lines))
vocab_size = len(vocab)
itos = dict(enumerate(vocab))
stoi = {value:key for key, value in itos.items()}

vocab_embedding = EmbeddingSubscriptable(vocab_size, embedding_size)

class RNN(nn.Module):
	def __init__(self, input_size, hidden_size, output_size):
		super().__init__()

		self.i2h = nn.Linear(input_size + hidden_size, hidden_size)
		self.i2o = nn.Linear(input_size + hidden_size, output_size)

		self.tanh = nn.Tanh()
		self.softmax = nn.LogSoftmax(dim=1)


	def forward(self, inp, hidden_state):
		combined = torch.cat((inp, hidden_state), dim=1)
		hidden = self.i2h(combined)
		output = self.i2o(combined)
		hidden = self.tanh(hidden)
		output = self.softmax(output)
		return hidden, output

	def initHidden(self):
		hidden = torch.nn.init.kaiming_normal_(torch.zeros(hidden_size).reshape(1, hidden_size))
		return hidden

@torch.no_grad()
def predict(model, length=50, start=" "):
	global vocab_embedding

	hidden_state = model.initHidden()

	last_c = None
	
	for c in start:
		inp = vocab_embedding[stoi[c]]
		hidden_state, output = model(inp, hidden_state)
		last_c = torch.argmax(output)

	for i in range(length):
		inp = vocab_embedding[last_c]
		hidden_state, output = model(inp, hidden_state)
		last_c = torch.argmax(output)
		print(itos[int(last_c)], end="")

def train_network(model, loss_func, optimizer, lr, data, device):
	model
	chunk = 1000  # number of characters to accumulate loss over before each backward()/step()
	curr_count = 0
	hidden_state = model.initHidden()
	optimizer.zero_grad()
	loss = 0
	for i in range(len(data) - 1):
		print(i)
		inp = vocab_embedding[stoi[data[i]]]
		label = torch.tensor(stoi[data[i + 1]]).unsqueeze(0)

		hidden_state, output = model(inp, hidden_state)
		loss = loss + loss_func(output, label)
		curr_count += 1

		if curr_count == chunk:
			loss.backward()
			optimizer.step()
			optimizer.zero_grad()
			curr_count = 0
			loss = 0


loss_func = nn.NLLLoss()
lr = 0.01
model = RNN(embedding_size, hidden_size, vocab_size)
optimizer = torch.optim.AdamW(model.parameters(), lr=lr)
train_network(model, loss_func, optimizer, lr, joined_lines, device)

I know it is usually caused by in-place operations, but I really can’t see anything going wrong besides the return self.emb.weight[idx].unsqueeze(0), and even that seemed to work fine with autograd when I did a small test with it in IPython. From what I can see it says it’s happening in hidden = self.i2h(combined), but I have no idea what’s going wrong there either.
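The small test I mention was roughly along these lines (a minimal sketch, not the exact IPython session; the sizes are arbitrary):

import torch
import torch.nn as nn

emb = nn.Embedding(10, 4)   # small dummy embedding: 10 tokens, size 4
lin = nn.Linear(4, 2)

x = emb.weight[3].unsqueeze(0)   # same subscripting as EmbeddingSubscriptable.__getitem__
out = lin(x).sum()
out.backward()                   # runs without any in-place error
print(emb.weight.grad)           # gradients do flow back into the embedding weight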

I guess the error is raised by the hidden_state tensor, since you are reusing it in every iteration while backward() is only called after chunk iterations.
You might want to detach() the hidden_state tensor inside the if-condition.
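For example, something along these lines (a sketch of the suggested change, using your variable names):

if curr_count == chunk:
	loss.backward()
	optimizer.step()
	optimizer.zero_grad()
	hidden_state = hidden_state.detach()  # cut the graph at the chunk boundary
	curr_count = 0
	loss = 0

detach() returns a tensor with the same values but disconnected from the old graph, so the next chunk's backward() no longer tries to reach parameters that optimizer.step() has already modified in place, which is what the version-counter error is complaining about.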

Ok, so I had to change it here: hidden_state, output = model(inp, hidden_state.detach()), however now my model is not learning anything. Is that a problem with the architecture being too simple, or is there some bug in the code?