I want to train an LSTM neural network similar to how I do it in Keras with `stateful = True`

. The goal is to be able to **transmit the states between the sequences of the same batch and between the sequences of different batches**. This is the class I use for the LSTM module:

```
class LSTMStateful(nn.Module):
def __init__(self, input_size, hidden_size, batch_size,
bidirectional=False, **kwargs):
super().__init__()
self._hidden_state, self._hidden_cell = (None, None)
self._batch_size = batch_size
self._hidden_size = hidden_size
self._bidirectional = bidirectional
self.lstm = nn.LSTM(input_size, hidden_size,
bidirectional=bidirectional, **kwargs)
self.reset_hidden_cell()
self.reset_hidden_state()
@property
def batch_size(self):
return self._batch_size
@property
def bidirectional(self):
return self._bidirectional
@property
def hidden_size(self):
return self._hidden_size
@property
def hidden_cell(self):
return self._hidden_cell
@property
def hidden_state(self):
return self._hidden_state
def reset_hidden_cell(self):
self._hidden_cell = torch.zeros(self.lstm.num_layers * (self.bidirectional + 1),
self.batch_size, self.hidden_size)
def reset_hidden_state(self):
self._hidden_state = torch.zeros(self.lstm.num_layers * (self.bidirectional + 1),
self.batch_size, self.hidden_size)
def forward(self, input_seq):
lstm_out, (self._hidden_cell, self._hidden_state) = self.lstm(input_seq,
(self._hidden_cell, self._hidden_state))
return lstm_out, (self._hidden_cell, self._hidden_state)
```

So I create a simple two-layer stacked model to compare it to the Keras model:

```
class MyModel(nn.Module):
def __init__(self):
super().__init__()
self.lstm = LSTMStateful(input_size=1, batch_size=BATCH_SIZE,
hidden_size=HIDDEN_SIZE,
num_layers=2, batch_first=True)
self.linear = nn.Linear(HIDDEN_SIZE, 1)
def forward(self, x):
output, (hidden_cell, hidden_state) = self.lstm(x)
output = self.linear(output[:,-1:,:]) #return_sequences = False in Keras
return output
```

So I train the model. I use 1750 sequences of length 200 that are divided into batches of 250, that is `(250, 200, 1)`

with `batch_first = True`

```
model = MyModel()
loss_function = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
torch.autograd.set_detect_anomaly(True)
epochs = 1000
train_loss = []
eval_loss = []
for i in range(epochs):
model.zero_grad()
model.train()
for x_train, y_train in zip(X_train, Y_train):
# x_train shape (250, 200, 1)
# y_train shape (250, 200, 1)
y_pred = model(x_train)
single_loss = loss_function(y_pred, y_train)
loss_train_value=single_loss.item()
train_loss.append(loss_train_value)
single_loss.backward()
optimizer.step()
model.lstm.reset_hidden_cell()
model.lstm.reset_hidden_state()
model = model.eval()
with torch.no_grad():
for x_eval, y_eval in zip(X_eval, Y_eval):
y_pred_eval = model(x_eval)
single_loss_eval = loss_function(y_pred_eval, y_eval)
loss_eval_value=single_loss_eval.item()
eval_loss.append(loss_eval_value)
print(f'epoch: {i:3} loss: {loss_train_value:10.8f} eval loss: {loss_eval_value:10.8f}')
```

This takes around 30 seconds per epoch while the model in Keras (which I consider similar takes 5 seconds per epoch), in addition the loss of validation is too great compared to that obtained in Keras. (Both Keras and pytorch use the same training and validation data) Leaving aside the absence of initializations of weights and bias it is evident that I am doing something wrong. Could you help me?