Dear PyTorch experts,
I am trying to build an LSTM model for multivariate regression, but the model never converges:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x, h0, c0):
        # Forward propagate LSTM
        out, _ = self.lstm(x, (h0, c0))  # out: tensor of shape (batch_size, seq_length, hidden_size)
        # Decode the hidden state of the last time step
        out = self.fc(out[:, -1, :])
        #out = F.relu(out)
        return out
in_features = 3
hidden_size = 128
nb_rnn_layers = 1
nb_classes = 3

net = RNN(in_features, hidden_size, nb_rnn_layers, nb_classes).to(device)
print(net)
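For context, the criterion and optimizer referenced in the training loop below follow the usual pattern; MSELoss and Adam with lr=1e-3 are shown here as a stand-in for my actual settings:

# Loss and optimizer (representative setup; stand-in for my actual settings)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(net.parameters(), lr=1e-3)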
This is an example of one (input, target) pair: the input has 48 time steps (note the trailing zero rows) and the target is 3 variables:
(tensor([[-45.2884, 11.8089, 294.8820],
[-45.4233, 11.8189, 295.8110],
[-45.9532, 11.8584, 299.4550],
[-46.0674, 11.8667, 300.2410],
[-46.6647, 11.9099, 304.3450],
[-46.7999, 11.9195, 305.2740],
[-47.3948, 11.9620, 309.3650],
[-47.5298, 11.9717, 310.2930],
[-49.9462, 12.1501, 326.9390],
[-50.0599, 12.1581, 327.7240],
[-50.5900, 12.1942, 331.3760],
[-50.7246, 12.2029, 332.3040],
[-51.3173, 12.2397, 336.4020],
[-51.4513, 12.2479, 337.3300],
[-52.0412, 12.2824, 341.4230],
[-52.1542, 12.2888, 342.2090],
[-58.4874, 12.5248, 393.9820],
[-58.5616, 12.5250, 394.8920],
[-59.0036, 12.5226, 400.9410],
[-59.0617, 12.5216, 401.8450],
[-59.6704, 12.4927, 415.4860],
[-59.6920, 12.4901, 416.3720],
[-59.7806, 12.4674, 422.9640],
[-59.7824, 12.4639, 423.8400],
[-59.5155, 12.4011, 437.4980],
[-59.4805, 12.3966, 438.3500],
[-59.1347, 12.3609, 444.9350],
[-59.0766, 12.3565, 445.8370],
[-57.8999, 12.2859, 459.5360],
[-57.8125, 12.2814, 460.3470],
[-57.0362, 12.2443, 466.9300],
[-56.9168, 12.2389, 467.8540],
[-11.5856, 10.5807, 606.8630],
[-11.1563, 10.5598, 607.7730],
[ -9.4675, 10.4768, 611.3410],
[ -9.0352, 10.4551, 612.2510],
[ -7.0042, 10.3541, 616.5160],
[ -6.5702, 10.3325, 617.4250],
[ 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000],
[ 0.0000, 0.0000, 0.0000]]),
tensor([[-0.1018, 0.1386, 0.9901]]))
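To clarify the shapes I expect: with batch_first=True, nn.LSTM takes inputs of shape (batch_size, seq_len, input_size), and h0/c0 have shape (num_layers, batch_size, hidden_size). A minimal shape check, using a hypothetical batch size of 4:

# Sanity check of expected shapes (hypothetical batch size of 4)
x = torch.randn(4, 48, in_features).to(device)               # (batch, seq, feature)
h0 = torch.zeros(nb_rnn_layers, 4, hidden_size).to(device)   # (layers, batch, hidden)
c0 = torch.zeros(nb_rnn_layers, 4, hidden_size).to(device)
print(net(x, h0, c0).shape)  # expected: torch.Size([4, 3])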
And here is the training loop:
# Train the model
total_step = len(dataloader)
kEpochs = 100
for epoch in range(kEpochs):
    for i_batch, (inputs, labels) in enumerate(dataloader):
        # Forward pass
        inputs = inputs.reshape(48, batch_size, 3).to(device)
        labels = labels.reshape(1, batch_size).to(device)
        h00 = torch.zeros(nb_rnn_layers, inputs.size(0), hidden_size).to(device)
        c00 = torch.zeros(nb_rnn_layers, inputs.size(0), hidden_size).to(device)
        outputs = net(inputs, h00, c00)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'.format(
            epoch + 1, kEpochs, i_batch + 1, total_step, loss.item()))
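For debugging, a quick way to inspect the raw batch shapes coming out of the dataloader, before any reshaping:

# Inspect raw batch shapes and dtypes before any reshape
inputs, labels = next(iter(dataloader))
print(inputs.shape, labels.shape)
print(inputs.dtype, labels.dtype)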
I have also used the same architecture for classification on a different task, by putting a sigmoid on top of the linear layer, and it worked perfectly.
What I don't understand is that the same model converges in Keras. Can anyone help me understand where the mistake is?
Thank you very much