LSTM outputs NaN

I have created a simple LSTM for forecasting. It outputs NaN from the very first epoch. Below is my LSTM architecture.

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_dim, num_layers, output_dim):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.output_dim = output_dim
        self.lstm = nn.LSTM(input_size, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_()

        # Initialize cell state with zeros
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_()

        # Propagate input through the LSTM
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))

        out = self.fc(out[:, -1, :])

        # Return the output of the last time step
        return out
 
model = LSTM(16, 8, 1, 1)

epochs = 100

for epoch in range(epochs):
    for i, batch in enumerate(trainloader):
        # Extract the input and output tensors for the current batch
        X_batch, y_batch = batch

        # Pass the input batch to the LSTM and perform backpropagation
        outputs = model(X_batch)
        print(outputs)
        y_batch = torch.squeeze(y_batch, dim=1)
        print(y_batch)
        loss = criterion(outputs, y_batch)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

    if (epoch + 1) % 10 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, epochs, loss.item()))

I am using MSELoss and the Adam optimizer.
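For reference, the loss and optimizer are set up roughly as follows (the learning rate shown is only a placeholder for the "very low" value mentioned below):

criterion = nn.MSELoss()
# placeholder learning rate; several low values were tried
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)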

I checked for null values in the training data but found none.
I am using the Adam optimizer and set a very low learning rate (a high learning rate also gave NaN).
Even with gradient clipping, the gradient norm and the output still show NaN; the check I ran was along the lines of the sketch below.
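A minimal sketch of that kind of NaN check, assuming it sits right after loss.backward() in the training loop above (the exact code may differ):

# Hypothetical debugging snippet, run right after loss.backward()
if torch.isnan(outputs).any():
    print("NaN in model output at epoch", epoch, "batch", i)
for name, param in model.named_parameters():
    if param.grad is not None and torch.isnan(param.grad).any():
        print("NaN gradient in", name)

torch.autograd.set_detect_anomaly(True) can also help pinpoint the first operation that produces NaN during the backward pass.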

What else could be the reason for the LSTM gradients and output being NaN?

I don't have access to your data, so I substituted static random data. I also set the loss function to L1Loss and the optimizer to Adam, and I'm not getting any errors or NaN values.

import torch
import torch.nn as nn

class LSTM(nn.Module):
    def __init__(self, input_size, hidden_dim, num_layers, output_dim):
        super(LSTM, self).__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.output_dim = output_dim
        self.lstm = nn.LSTM(input_size, hidden_dim, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)

    def forward(self, x):
        # Initialize hidden state with zeros
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_()

        # Initialize cell state with zeros
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_dim).requires_grad_()

        # Propagate input through the LSTM
        out, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))

        out = self.fc(out[:, -1, :])

        # Return the output of the last time step
        return out


model = LSTM(16, 8, 1, 1)
criterion = nn.L1Loss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
epochs = 100
X_batch, y_batch = torch.rand((10, 8, 16)), torch.randn(10)
for epoch in range(epochs):
    for i in range(100):
        # Extract the input and output tensors for the current batch


        # Pass the input batch to the LSTM and perform backpropagation
        outputs = model(X_batch)
        #print(outputs)
        #y_batch = torch.squeeze(y_batch, dim=1)
        #print(y_batch)
        loss = criterion(outputs.view(-1), y_batch)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        optimizer.step()

    if (epoch + 1) % 10 == 0:
        print('Epoch [{}/{}], Loss: {:.4f}'.format(epoch + 1, epochs, loss.item()))

Try checking your data with a simple check inside the training loop, right after unpacking X_batch and y_batch:

if torch.any(torch.isnan(X_batch)) or torch.any(torch.isnan(y_batch)):
    print("NaN values found in", epoch, i)
    break

Thanks! Yes, there actually were NaN values in the data; the code above helped me spot them. Now I'm getting a very high loss value. Do you have any tricks to reduce the loss? I tried scaling the data, but the loss value stays more or less the same.
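For context, scaling here means something along these lines; this is a minimal sketch with hypothetical tensors standing in for the real data. Note that if only the inputs are scaled while the targets keep large values, the MSE naturally stays large.

import torch

# Hypothetical tensors standing in for the real training data:
# X_train: (num_samples, seq_len, num_features), y_train: (num_samples, 1)
X_train = torch.rand(100, 8, 16)
y_train = torch.rand(100, 1) * 1000.0  # large-valued targets inflate the MSE

# Standardize the inputs per feature
X_mean = X_train.mean(dim=(0, 1), keepdim=True)
X_std = X_train.std(dim=(0, 1), keepdim=True)
X_scaled = (X_train - X_mean) / (X_std + 1e-8)

# Standardizing the targets as well keeps the loss in a comparable range;
# predictions can be mapped back with y_pred * y_std + y_mean
y_mean, y_std = y_train.mean(), y_train.std()
y_scaled = (y_train - y_mean) / (y_std + 1e-8)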