Implementing a Keras stateful LSTM model in PyTorch

Hi, I'm a bit of a newbie in PyTorch :slight_smile:

What I'm trying to build is a time series prediction model.
After much trial and error, I found the Keras code I wanted and tried to port it to PyTorch.

The main point of the Keras model is that it is set to stateful=True, so in my PyTorch version I also reuse the hidden state and cell state values from the previous mini-batch instead of re-initializing them for every batch.
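To make that concrete, here is a rough standalone sketch (not my real code, just made-up tensors using my batch size of 672 and hidden size of 120) of the pattern I'm trying to reproduce in PyTorch: the hidden/cell state from one mini-batch is detached and passed into the next one instead of being re-initialized.

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=1, hidden_size=120, num_layers=1)
h = torch.zeros(1, 672, 120)   # (num_layers, batch, hidden_size), created once per epoch
c = torch.zeros(1, 672, 120)

for step in range(3):                  # stands in for the mini-batch loop
    x = torch.randn(1, 672, 1)         # (seq_len, batch, features)
    h, c = h.detach(), c.detach()      # keep the values, cut the old autograd graph
    out, (h, c) = lstm(x, (h, c))      # state from the previous batch flows into this one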

But the loss in my code does not decrease at all, and the predictions look very strange. I have no idea what's wrong. Since I want to start with the most basic model, I set both the look-back and look-ahead windows to 1 day (the same as in the Keras code). The feature dimension is also 1.

So the shape of input_X should be (1, batch_size, 1).
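Just to double-check the shape I mean, this tiny snippet is what I believe nn.LSTM expects with its default batch_first=False (672 is my batch size, 120 my hidden size):

import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=1, hidden_size=120, num_layers=1)   # batch_first=False by default
x = torch.randn(1, 672, 1)             # (seq_len=1, batch=672, features=1)
out, (hn, cn) = lstm(x)                # states default to zeros when not given
print(out.shape)                       # torch.Size([1, 672, 120])
print(hn.shape)                        # torch.Size([1, 672, 120])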

Could anyone please take a look at my code and point out why the loss does not decrease? The Keras model summary and my PyTorch code are below.

# Keras Model

def build_model(self):
    # first add input to hidden1
    self.model.add(LSTM(
        units=120,
        batch_input_shape=(672,1,1),
        stateful=True))
    self.model.add(Dropout(0.1))
    
    # add dense layer with output dimension to get output for one time_step
    self.model.add(Dense(units=1))

    # Repeat for look_ahead steps to get outputs for look_ahead timesteps.
    self.model.add(RepeatVector(1))

    # add activation
    self.model.add(Activation("linear"))

    # compile model and print summary
    self.model.compile(loss="MSE", optimizer=Adam(lr=0.02, decay=0.99))

    return self.model

And here is my PyTorch code:

import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Xtrain = torch.load('X_train.pt')
ytrain = torch.load('y_train.pt')

batch_size = 672

remained = len(Xtrain) % 672  # len(Xtrain) = 22846
new_len = len(Xtrain) - remained # new_len = 22176

# In a stateful network, the number of samples has to be divisible by the batch size
X_train = Xtrain[:new_len]
y_train = ytrain[:new_len]

# transpose for input shape
X_train = np.transpose(X_train, (1, 0, 2))
batch_X_train = X_train.reshape(-1,672,1)

y_train = np.transpose(y_train, (1, 0, 2))
batch_y_train = y_train.reshape(-1,672,1)

print(batch_X_train.shape) # (33, 672, 1)
print(batch_y_train.shape) # (33, 672, 1)

trainX_tensor = torch.FloatTensor(batch_X_train)
trainY_tensor = torch.FloatTensor(batch_y_train)

trainData = TensorDataset(trainX_tensor, trainY_tensor)

class LSTM(nn.Module):

    def __init__(self, input_dim, hidden_size, out_dim, num_layers, batch_size, dropout, device):
        super(LSTM, self).__init__()

        self.batch_size = batch_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.dropout_p = dropout
        self.device = device

        self.lstm = nn.LSTM(input_dim, hidden_size, num_layers=num_layers)
        self.fc1 = nn.Linear(hidden_size, hidden_size//2)
        self.dropout = nn.Dropout(p = self.dropout_p)
        self.fc2 = nn.Linear(hidden_size//2,out_dim)

    def forward(self, x, hidden, cell):
        lstm_out, (hidden, cell) = self.lstm(x, (hidden, cell))

        out = self.fc1(lstm_out)
        out = F.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)

        return out, hidden, cell

epoch = 5
input_dim = 1
out_dim = 1
hidden_size = 120
num_layers = 1
batch_size = 672
dropout = 0.1
learning_rate = .02

loss_function = nn.MSELoss()

model = LSTM(input_dim, hidden_size, out_dim, num_layers, batch_size, dropout, device).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=.99)

# initialize hidden and cell state
def hidden_state_init(stack_num, bsz, hidden):
    hw = torch.empty(stack_num, bsz, hidden)
    cw = torch.empty(stack_num, bsz, hidden)
    h_state = torch.nn.init.xavier_uniform_(hw)
    c_state = torch.nn.init.xavier_uniform_(cw)
    return h_state, c_state

for epoch_num in range(epoch):
    model.train()
    y_prediction = []
    loss_list = []
    average_loss = 0
    h_state, c_state = hidden_state_init(num_layers, batch_size, hidden_size)
    # move the initial states to the same device as the model
    h_state, c_state = h_state.to(device), c_state.to(device)
    
    for idx, data in enumerate(trainData):

        X, y = data
        batch_size = X.shape[0]
        # reshape to (seq_len=1, batch, features=1) and move to the same device as the model
        X = X.reshape(-1, batch_size, 1).to(device)
        y = y.reshape(-1, batch_size, 1).to(device)

        # For stateful training: keep the state values from the previous mini-batch,
        # but detach them so the autograd graph does not grow across batches
        h_state = h_state.detach()
        c_state = c_state.detach()

        optimizer.zero_grad()

        y_pred, h_state, c_state  = model(X, h_state, c_state)
        y_prediction.append(y_pred)

        loss = loss_function(y_pred, y)

        loss.backward()
        optimizer.step()

        average_loss += loss.item() / len(trainData)  # 33 mini-batches per epoch
        loss_list.append(loss.item())

    print(average_loss)

The average loss printed for each epoch comes out like this:

1.0038617777101921
1.0039293639587632
1.003980219364166
1.0042506131258877
1.0043293851794615


I'm not sure what to do now. Any suggestions would be really appreciated.