I have created a simple LSTM for forecasting. It gives NaN output from the very first epoch. Below is my LSTM architecture.
import torch
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_dim, num_layers, output_dim):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.output_dim = output_dim
        self.lstm = nn.LSTM(input_size, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_()
        # Initialize cell state with zeros
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_()
        # Propagate input through the LSTM
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
        # Return the output of the last time step
        out = self.fc(out[:, -1, :])
        return out
model = LSTM(16, 8, 1, 1)

epochs = 100
for epoch in range(epochs):
    for i, batch in enumerate(trainloader):
        # Extract the input and target tensors for the current batch
        X_batch, y_batch = batch
        # Forward pass through the LSTM
        outputs = model(X_batch)
        print(outputs)
        y_batch = torch.squeeze(y_batch, dim=1)
        print(y_batch)
        loss = criterion(outputs, y_batch)
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
    if (epoch+1) % 10 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch+1, epochs, loss.item()))
I am using MSELoss and the Adam optimizer.
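The setup is roughly the following (the learning rate shown is just a placeholder, not the exact value I used):

criterion = nn.MSELoss()
# lr value is a placeholder; I tried very low values as well as higher ones
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)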
I checked for null values in the training data and found none.
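The check was along these lines (X_train / y_train are placeholder names for my training tensors), and it did not flag anything:

# Placeholder tensor names; the real data is what trainloader is built from
print(torch.isnan(X_train).any())   # tensor(False)
print(torch.isnan(y_train).any())   # tensor(False)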
I am using the Adam optimizer with a very low learning rate (a high learning rate also gave NaN).
Even with gradient clipping, the gradient norm and the model outputs still show NaN.
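This is roughly how I observe it: clip_grad_norm_ returns the total gradient norm, and both it and the outputs come back as NaN (sketch of the checks, placed after loss.backward() in the loop above):

# After loss.backward(), inside the training loop
total_norm = nn.utils.clip_grad_norm_(model.parameters(), 1.0)
print(total_norm)                    # tensor(nan)
print(torch.isnan(outputs).any())    # tensor(True)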
What else could cause the LSTM gradients and outputs to be NaN?