Hi everyone,

I´m training an LSTM and I´m getting the following error:

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [256, 1024]], which is output 0 of TBackward, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!

My code is as follows:

- Defining the net:

```
import torch.nn as nn
class LSTM(nn.Module):
#defining initialization method
def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, dropout=.5):
#class constructor
super(LSTM, self).__init__()
self.vocab_size = vocab_size
self.output_size = output_size
self.embedding_dim = embedding_dim
self.hidden_dim = hidden_dim
self.n_layers = n_layers
#defining the embedding layer
#produces a lower dimensional representation of the input vector
self.embed = nn.Embedding(vocab_size, embedding_dim)
#defining lstm cell
self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, dropout=dropout, batch_first = True)
self.dropout = nn.Dropout(0.2)
self.fc1 = nn.Linear(hidden_dim, output_size)
self.sigmoid = nn.Sigmoid()
#defining forward propagation
def forward(self, x, hidden):
#getting batch_size from input vector
batch_size = x.size(0)
#getting on with it
embeddings = self.embed(x)
out, ct = self.lstm(embeddings, hidden)
#stacking output
lstm_out = out.contiguous().view(-1, self.hidden_dim)
out = self.dropout(lstm_out)
out = self.fc1(out)
#horizontally stacking the predictions and getting the last one
out = out.view(batch_size, -1, self.output_size)
out = out[:, -1]
return out, ct
#defining method to initialize hidden states
def init_hidden(self, batch_size):
#getting weights of LSTM class
weight = next(self.parameters()).data
# initialize hidden state with zero weights, and move to GPU if available
#if (train_on_gpu):
#hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
#weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
#else:
hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
return hidden
```

- Instantiating the net:

```
n_epochs = 15
learning_rate = 0.0005
vocab_size = len(vocab_to_int)
output_size = len(vocab_to_int)
embedding_dim = 256
hidden_dim = 256
n_layers = 2
#instantiating lstm
lstm = LSTM(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)
#defining criterion
criterion = nn.CrossEntropyLoss()
#definig optimizer
optimizer = torch.optim.Adam(lstm.parameters(), lr=learning_rate)
```

- Training the net:

```
for epoch in range(1, n_epochs):
hidden = lstm.init_hidden(batch_size)
#now looping through data
for batch_i, (features, targets) in enumerate(trainloader, 1):
#watch out for incomplete batches
print(batch_i)
n_batches = len(trainloader.dataset) // batch_size
if (batch_i > n_batches):
break
print(features)
print(features.shape)
print(targets)
print(targets.shape)
#defining a forward pass
output, hidden = lstm(features, hidden)
lstm.zero_grad()
loss = criterion(output, targets)
loss.backward(retain_graph=True)
optimizer.step()
```

Any ideas? Thanks!