LSTM :RuntimeError: one of the variables needed

Hallo I am a beginner and want to build a word prediction model with LSTM
but I get the following error and I cant find the cause

Thank you for your help

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.cuda.FloatTensor [1536, 200]] is at version 2; expected version 1 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
import torch
import torch.nn as nn
import string
import random
import sys
import unidecode
from torch.utils.tensorboard import SummaryWriter
import numpy as np
import re
import math
print(torch.__version__)

def char_tensor(string):
    tensor = torch.zeros(len(string)).long()
    for c in range(len(string)):
        tensor[c] = all_characters.index(string[c])
    return tensor

def clean_text(text):
    text = text.replace('\n',' ')
    text = re.sub('[^A-Za-z ]+', '', text)
    return text


def word_to_class(word,  word_dict):
    word_class = word_dict[word]
    word_class_t = torch.zeros(1) + word_class
    return word_class_t



def one_hot_encode(y,output_size):
    one_hot =  torch.zeros(output_size)
    one_hot[y] = 1
    return one_hot

def create_batches(X_train, y_train, choices, word_dict, output_size,  batch_size):
    x_batch = []
    y_batch = []
    for i in range(batch_size):
        idx = choices.pop()
        x_batch.append(char_tensor((X_train[idx])))
        num = word_to_class(y_data[idx], word_dict)
        one_hot = one_hot_encode(int(num), output_size)
        y_batch.append(one_hot)
    
    xt_batch = torch.stack(x_batch)
    yt_batch = torch.stack(y_batch)
                          
    return xt_batch, yt_batch, choices 

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(RNN, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers

        #self.embed = nn.Embedding(input_size, hidden_size)
        #self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=False)
        self.fc = nn.Linear(hidden_size, output_size)
        #self.softmax = nn.LogSoftmax(dim=1)
    
    def forward(self, x, hidden, cell):
        #out = self.embed(x)
        out, (hidden, cell) = self.lstm(x, (hidden, cell))
        output = self.fc(out)
        #output = self.softmax(output)
        return output, (hidden, cell)

    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.num_layers, batch_size, self.input_size).to(device)
        cell = torch.zeros(self.num_layers, batch_size, self.input_size).to(device)
        #hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
        #cell = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device)
        return hidden, cell



class SentimentNet(nn.Module):
    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, device, drop_prob=0.5):
        super(SentimentNet, self).__init__()
        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        self.device = device
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=drop_prob, batch_first=True)
        self.dropout = nn.Dropout(drop_prob)
        self.fc = nn.Linear(hidden_dim, output_size)
        #self.sigmoid = nn.softmax()
        
    def forward(self, x, hidden):
        batch_size = x.size(0)
        x = x.long()
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        #lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        
        out = self.dropout(lstm_out)
        #out = self.fc(out)
        out = self.fc(out[:, -1, :])
        #out = out[:,-1]
        print("out of fc", out.shape)
        #out = self.sigmoid(out)
        
        #out = out.view(batch_size, -1)
        return out, hidden
    
    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(self.device), weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().to(self.device))
        return hidden





# Device configuration
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = "cpu"
# Get characters from string.printable
all_characters = string.printable
n_characters = len(all_characters)
# Read large text file (Note can be any text file: not limited to just names)
text = unidecode.unidecode(open("wonderland.txt").read())
text_clean = clean_text(text)
text_clean = text_clean.split()
seq_length = 100
y_data = []
data = []
part = ''
for idx in range(0, len(text_clean) - 1):
    part += (text_clean[idx] + " ")
    new_length = len(part) + len(text_clean[idx+1])
    if new_length > seq_length:
        part =  part[:-1]
        part = part.rjust(100)
        y_data.append(text_clean[idx+1])
        data.append(part)
        part = ''
        continue
print("data size ", len(data))
len(y_data)
train_size = round(len(data) * 0.9)
X_train = data[:train_size]  
X_test = y_data[:train_size]
y_train = data[train_size:]
y_test = y_data[train_size:]
unique_word = set(text_clean)
output_size = len(set(text_clean))

seq_length = 100
num_epochs = 5000
batch_size = 16
hidden_size = 128*3
num_layers = 3
lr = 0.3

iterations  = math.floor(len(X_train)/ batch_size)
train_data_size = iterations * batch_size
choices = list(range(train_data_size))
word_dict = {word:idx for idx,word in enumerate(unique_word)}
#model = RNN(seq_length, hidden_size, num_layers,output_size).to(device)
model = SentimentNet(seq_length, output_size, 200, hidden_size, num_layers, device).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()
#criterion = nn.BCELoss()

print("=> Starting training")
best_loss = float("inf")
for epoch in range(1, num_epochs + 1):
    hidden = model.init_hidden(batch_size=batch_size)
    epoch_loss = 0
    choices = list(range(train_data_size))
    
    for idx in range(iterations):
       
        
        x_batch, y_batch,  choices = create_batches(X_train, y_train, choices, word_dict, output_size, batch_size)
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device).long()
        #x_batch = x_batch.unsqueeze(0)
        print("Input size ", x_batch.shape)
        # output, (hidden, cell) = model(x_batch, hidden, cell)
        output, hidden = model(x_batch, hidden)
        print(output.shape)
        print("target", y_batch.shape)
        output = output.float()
        y_batch = y_batch.long()
        loss = criterion(output, torch.max(y_batch, 1)[1]).to(device)
        optimizer.zero_grad()
        loss.backward(retain_graph=True)
        optimizer.step()
        #epoch_loss += loss.item()
    epoch_loss = epoch_loss / iterations
    print("Epoch {} loss {}".format(epoch, epoch_loss ))

The error is most likely raised, as you are not detaching the hidden state, keep the computation graph alive via retain_graph=True, and manipulate the parameters inplace via optimizer.step().

Are you sure you need to use retain_graph=True? If not, remove it and detach the hidden state via:

hidden = (hidden[0].detach(), hidden[1].detach())

before passing it to the model.

Alternatively to @ptrblck suggestion, you might want to call

hidden = model.init_hidden(batch_size=batch_size) 

For each batch not just for each epoch.

1 Like

Thank you both
It really solved the issue