Hi, I’m a new person to PyTorch, and am trying to learn a basic RNN from scratch in PyTorch, but I keep getting this runtimeerror. I’ve seen multiple threads about this, and I don’t think I have any in-place operators like += or []. Why is this happening? Below is my entire error message, entire code, and the backtrace error message for the detect anomaly. I’d appreciate the help, thank you so much!
Error Message:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [186, 18]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
Code:
import torch
from utils import *
import pdb
import json
import math
# import more functions needed on top of the given imported functions above:
from sklearn.model_selection import train_test_split
import numpy as np
import torch.nn as nn
from matplotlib import pyplot as plt
import time
from sklearn.metrics import confusion_matrix, accuracy_score
import seaborn as sns
import pandas as pd
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") #run on gpu if possible
n_letters=58
n_categories=18
n_hidden = 128
n_epochs = 100 # changed from 100 to 50 for faster training
print_every = 5000
plot_every = 1000
learning_rate = 0.0005 # changed from 0.005 to 0.001
class RNN(torch.nn.Module):
def __init__(self, input_size, hidden_size, output_size):
"""
Args:
input_shape (int): size of the 1-hot embeddings for each character (this will be 58)
hidden_layer_width (int): number of nodes in the single hidden layer within the model
n_classes (int): number of output classes
"""
super(RNN, self).__init__()
### TODO Implement the network architecture
self.hidden_size = hidden_size
self.input_to_hidden = nn.Linear(input_size + hidden_size, hidden_size)
self.input_to_output = nn.Linear(input_size + hidden_size, output_size)
self.softmax = nn.Softmax(dim = 1)
# raise NotImplementedError
def forward(self, input, hidden):
"""Forward function accepts tensor of input data, returns tensor of output data.
Modules defined in constructor are used, along with arbitrary operators on tensors
"""
### TODO Implement the forward function
input = input.to(device)
hidden = hidden.to(device)
concat = torch.cat((input, hidden), dim = 1)
concat = concat.to(device)
hidden = torch.nn.functional.softmax(self.input_to_hidden(concat))
output = self.input_to_output(concat)
output = output.to(device)
output = self.softmax(output)
# your function will return the output y(t) and hidden h(t) from equation 1 in the docs
return output, hidden
# raise NotImplementedError
def initHidden(self):
"""
This function initializes the first hidden state of the RNN as a zero tensor.
"""
return torch.zeros(1, self.hidden_size)
def get_xy_pairs(names):
#TODO
#process the names dict and convert into a list of (x,y) pairs. x is a 1-hot tensor of size (num_characters_in_name, 1, n_letters)
#y is a scalar representing the category of the language, there are 18 languages, assign an index between 0-17 to each language and y represents this index.
#you may make use of the nameToTensor() function in the utils.py file to help you with this function
# Turn a line into a <line_length x 1 x n_letters>,
# or an array of one-hot letter vectors
y_list = [item for item in names.keys()] #keys
x_list = [item for item in names.values()] #values
label_y_list = [idx for (idx, item) in enumerate(y_list)] # 0-17
x_list_length = [nameToTensor(x_list[idx1]).shape[0] for idx1 in range(len(label_y_list))] #num of names of each language
label_y_list = [np.repeat(label_y_list[i], x_list_length[i]) for i in range(len(label_y_list))] #repeat
label_y_list = np.array(np.concatenate(label_y_list).flat) #flatten
x_list = np.array(np.concatenate(x_list).flat) #flatten
name_x_list = [nameToTensor(x_list[i]) for i in range(len(x_list))] #tensor
list_of_pairs = list(zip(name_x_list, label_y_list)) #combine into list
return list_of_pairs
def create_train_and_test_set(list_of_pairs):
#TODO
#process the list of (x,y) pairs and split them 80-20 into train and test set
#train_x is a list of name embeddings each of size (num_characters_in_name, 1, n_letters), train_y is the correponding list of language category index. Same for test_x and test_y
x_list = [i for i, j in list_of_pairs]
y_list = [j for i, j in list_of_pairs]
train_x, test_x, train_y, test_y = train_test_split(x_list, y_list, test_size=0.2, random_state = 42, stratify = y_list)
return train_x, train_y, test_x, test_y
rnn = RNN(n_letters, n_hidden, n_categories)
rnn.to(device)
optimizer = torch.optim.SGD(rnn.parameters(), lr=learning_rate)
criterion = nn.NLLLoss()
torch.autograd.set_detect_anomaly(True)
def train(train_x, train_y):
"""train_x and train_y are lists with names and correspoonding labels"""
output_ra = []
loss = 0
for x, y in zip(train_x, train_y):
hidden = rnn.initHidden()
for i in range(x.size()[0]):
output, hidden = rnn(x[i].clone(), hidden.clone())
loss = loss + criterion(torch.log(output.to(device) + 1e-6), torch.tensor(y, dtype= torch.long).unsqueeze(0).to(device))
loss = loss.to(device)
optimizer.zero_grad()
loss.backward(retain_graph = True)
optimizer.step()
output_ra.extend(output.detach().cpu().numpy())
return loss, output_ra
# for i in range(x.size()[0]):
# output, hidden = rnn(x[i], hidden)
# loss += criterion(torch.log(output.to(device) + 1e-6), torch.tensor(y, dtype= torch.long).unsqueeze(0).to(device)) #the unsqueeze converts the scalar y to a 1D tensor
# loss = loss.to(device)
#
# output_ra.extend(output.detach().cpu().numpy())
# optimizer.zero_grad()
#
# loss.backward()
#
# optimizer.step()
#
# return loss, output_ra
def test(train_x, train_y):
"""train_x and train_y are lists with names and correspoonding labels"""
loss = 0
output_ra = []
with torch.no_grad():
for x, y in zip(train_x, train_y):
hidden = rnn.initHidden()
for i in range(x.size()[0]):
output, hidden = rnn(x[i], hidden)
loss += criterion(torch.log(output.to(device) + 1e-6), torch.tensor(y, dtype= torch.long).unsqueeze(0).to(device)) #the unsqueeze converts the scalar y to a 1D tensor
output_ra.extend(output.detach().cpu().numpy()) #also save the predicted labels at the last epoch for the confusion matrix
return loss, output_ra
#names is your dataset in python dictionary form. Keys are languages and values are list of words belonging to that language
with open('names.json', 'r') as fp:
names = json.load(fp)
list_of_pairs = get_xy_pairs(names)
train_x, train_y, test_x, test_y = create_train_and_test_set(list_of_pairs)
I get this error message when I call
for epoch in range(1, n_epochs + 1):
current_train_loss, train_pred = train(train_x, train_y)
Error message:
C:\Users\choke\anaconda3\lib\site-packages\torch\autograd\__init__.py:173: UserWarning: Error detected in AddmmBackward0. Traceback of forward call that caused the error:
File "C:\Users\choke\anaconda3\lib\runpy.py", line 197, in _run_module_as_main
return _run_code(code, main_globals, None,
File "C:\Users\choke\anaconda3\lib\runpy.py", line 87, in _run_code
exec(code, run_globals)
File "C:\Users\choke\anaconda3\lib\site-packages\ipykernel_launcher.py", line 16, in <module>
app.launch_new_instance()
File "C:\Users\choke\anaconda3\lib\site-packages\traitlets\config\application.py", line 846, in launch_instance
app.start()
File "C:\Users\choke\anaconda3\lib\site-packages\ipykernel\kernelapp.py", line 677, in start
self.io_loop.start()
File "C:\Users\choke\anaconda3\lib\site-packages\tornado\platform\asyncio.py", line 199, in start
self.asyncio_loop.run_forever()
File "C:\Users\choke\anaconda3\lib\asyncio\base_events.py", line 601, in run_forever
self._run_once()
File "C:\Users\choke\anaconda3\lib\asyncio\base_events.py", line 1905, in _run_once
handle._run()
File "C:\Users\choke\anaconda3\lib\asyncio\events.py", line 80, in _run
self._context.run(self._callback, *self._args)
File "C:\Users\choke\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 471, in dispatch_queue
await self.process_one()
File "C:\Users\choke\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 460, in process_one
await dispatch(*args)
File "C:\Users\choke\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 367, in dispatch_shell
await result
File "C:\Users\choke\anaconda3\lib\site-packages\ipykernel\kernelbase.py", line 662, in execute_request
reply_content = await reply_content
File "C:\Users\choke\anaconda3\lib\site-packages\ipykernel\ipkernel.py", line 360, in do_execute
res = shell.run_cell(code, store_history=store_history, silent=silent)
File "C:\Users\choke\anaconda3\lib\site-packages\ipykernel\zmqshell.py", line 532, in run_cell
return super().run_cell(*args, **kwargs)
File "C:\Users\choke\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2863, in run_cell
result = self._run_cell(
File "C:\Users\choke\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 2909, in _run_cell
return runner(coro)
File "C:\Users\choke\anaconda3\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
coro.send(None)
File "C:\Users\choke\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3106, in run_cell_async
has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
File "C:\Users\choke\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3309, in run_ast_nodes
if await self.run_code(code, result, async_=asy):
File "C:\Users\choke\anaconda3\lib\site-packages\IPython\core\interactiveshell.py", line 3369, in run_code
exec(code_obj, self.user_global_ns, self.user_ns)
File "C:\Users\choke\AppData\Local\Temp\ipykernel_4776\3138297363.py", line 8, in <cell line: 7>
current_train_loss, train_pred = train(train_x, train_y)
File "C:\Users\choke\AppData\Local\Temp\ipykernel_4776\706923590.py", line 111, in train
output, hidden = rnn(x[i].clone(), hidden.clone())
File "C:\Users\choke\anaconda3\lib\site-packages\torch\nn\modules\module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "C:\Users\choke\AppData\Local\Temp\ipykernel_4776\706923590.py", line 54, in forward
output = self.input_to_output(concat)
File "C:\Users\choke\anaconda3\lib\site-packages\torch\nn\modules\module.py", line 1130, in _call_impl
return forward_call(*input, **kwargs)
File "C:\Users\choke\anaconda3\lib\site-packages\torch\nn\modules\linear.py", line 114, in forward
return F.linear(input, self.weight, self.bias)
(Triggered internally at C:\cb\pytorch_1000000000000\work\torch\csrc\autograd\python_anomaly_mode.cpp:104.)