Hi, I am trying to train an RNN and an LSTM on some data for a regression problem. Some runs train fine, but in others (roughly 1 in 3 runs) the model is not updated at all and the train loss does not change over the epochs, even though I use exactly the same data and code as in the runs that train fine.

To see what happens, I printed the gradients during training, and it turned out that for the runs where the RNN / LSTM does not improve, the gradients are always 0.
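For reference, the gradient check itself is nothing special; it is roughly the sketch below, called on the model right after `loss.backward()` (`print_grad_norms` and `model` are just placeholder names, not my actual code):

```
# Rough sketch of the gradient check (placeholder names): print the gradient norm of
# every parameter right after loss.backward() to see which ones receive a gradient.
def print_grad_norms(model):
    for name, param in model.named_parameters():
        if param.grad is None:
            print(f"{name}: no grad yet")
        else:
            print(f"{name}: grad norm = {param.grad.norm().item():.6f}")
```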

Does anyone know why this happens?

For reference, here is a snippet of my code.

LSTM model:

```
import torch
import torch.nn as nn


class LSTM1(nn.Module):
    def __init__(self, num_classes, input_size, hidden_size, num_layers, dropout=0):
        super(LSTM1, self).__init__()
        # LSTM parameters
        self.num_classes = num_classes  # number of output classes
        self.num_layers = num_layers    # number of stacked LSTM layers
        self.input_size = input_size    # number of input features
        self.hidden_size = hidden_size  # size of the hidden state
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=False, dropout=dropout)
        # With batch_first=False, the input dimension is (L, N, H_in), where L is the sequence
        # length, N is the batch size, and H_in is the number of input features (input size).
        # https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
        # To use dropout > 0 inside nn.LSTM, num_layers must be greater than 1.
        self.fc = nn.Linear(hidden_size, num_classes)  # fully connected linear layer
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(dropout)

    def forward(self, x):
        # x is assumed to be in (sequence_length, batch_size, input_features).
        h_0 = torch.zeros(self.num_layers, x.size(1), self.hidden_size)  # hidden state
        c_0 = torch.zeros(self.num_layers, x.size(1), self.hidden_size)  # cell state
        # Fresh hidden and cell states are used for every new sequence; one full sequence
        # is handled in one call of forward(), which is why h_0 and c_0 are re-initialized here.
        # Propagate the input through the LSTM.
        output, (hn, cn) = self.lstm(x, (h_0, c_0))
        # I could omit h_0 and c_0, since the defaults are zero tensors of the same shape.
        # output contains the hidden states of all time steps (h_1, h_2, ..., h_n), including hn;
        # in nn.LSTM the hidden-state tensor is what serves as the output.
        out = output.view(-1, self.hidden_size)
        out = self.drop(out)
        out = self.fc(out)
        # Since out contains the hidden states of all time steps, the linear layer
        # is applied to every time step.
        out = self.relu(out)  # final ReLU
        return out
```
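In case the expected shapes are unclear, this is roughly how the model is meant to be called. `hidden_size=5` and `num_classes=1` match the fc layer shown in the outputs further down; the sequence length, batch size, and `input_size` here are just made-up example values, not my real settings:

```
# Illustrative usage only: hidden_size=5 and num_classes=1 match the printed fc layer below,
# while the sequence length, batch size, and input_size are placeholder values.
import torch

model = LSTM1(num_classes=1, input_size=3, hidden_size=5, num_layers=1, dropout=0)
x = torch.randn(30, 1, 3)   # (sequence_length, batch_size, input_features)
y = model(x)                # one forward pass over the whole sequence
print(y.shape)              # torch.Size([30, 1]) -> one prediction per time step
```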

training code:

```
import numpy as np
import torch
import torch.nn as nn


def train_test_rnn(rnn_obj, train_data_list, target_data_list, test_data, test_target_data,
                   epoch_size, learning_rate, epoch_num_per_log, grad_clip_val=1.0):
    if not isinstance(train_data_list, list):
        raise RuntimeError("The train data must be a list of 3-D tensors, "
                           "each of which corresponds to 1 sequence.")
    # loss function is MSE
    criterion = nn.MSELoss()
    # optimizer is Adam
    optimizer = torch.optim.Adam(rnn_obj.parameters(), lr=learning_rate)
    train_loss_list = []
    test_loss_list = []
    # set the model to train mode so that nn.Dropout() is active
    rnn_obj.train()
    for epoch in range(epoch_size):
        # learn the sequences in a random order (I don't know whether this helps)
        perm = np.random.permutation(len(train_data_list))
        total_train_loss = 0
        # update the model once per sequence
        for i in range(len(train_data_list)):
            train_data = train_data_list[perm[i]]
            target_data = target_data_list[perm[i]]
            outputs = rnn_obj.forward(train_data)  # forward pass
            optimizer.zero_grad()  # reset previously accumulated gradients
            loss = criterion(outputs, target_data)
            loss.backward()  # compute the gradients of the loss
            # gradient clipping
            torch.nn.utils.clip_grad_norm_(rnn_obj.parameters(), grad_clip_val)
            optimizer.step()  # perform a single optimization step (parameter update)
            total_train_loss += loss.item()
        if epoch % epoch_num_per_log == 0 or epoch == epoch_size - 1:
            print("Epoch: %d, total train loss: %1.5f" % (epoch, total_train_loss))
            print("### weights of linear layer:")
            print(rnn_obj.fc.weight)
            print("### bias of linear layer:")
            print(rnn_obj.fc.bias)
            print("### gradients of linear layer weights:")
            print(rnn_obj.fc.weight.grad)
            print("### gradients of linear layer bias:")
            print(rnn_obj.fc.bias.grad)
        rnn_obj.eval()  # switch to eval() mode to turn off dropout and similar layers
        # first, the loss on the train data
        total_train_loss = 0
        for i in range(len(train_data_list)):
            outputs = rnn_obj.forward(train_data_list[i])
            loss = criterion(outputs, target_data_list[i])
            total_train_loss += loss.item()
        total_train_loss /= len(train_data_list)
        train_loss_list.append(total_train_loss)
        # second, the loss on the test data
        outputs = rnn_obj.forward(test_data)
        loss = criterion(outputs, test_target_data)
        test_loss_list.append(loss.item())
        rnn_obj.train()  # return to train() mode, since training continues
    train_loss_array = np.array(train_loss_list)
    test_loss_array = np.array(test_loss_list)
    return train_loss_array, test_loss_array
```
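And this is roughly how I call the training function; the data here is synthetic and the shapes / hyperparameters are placeholder values, only meant to show the expected format (a list of per-sequence 3-D tensors plus matching targets):

```
# Illustrative call with synthetic data (placeholder shapes and hyperparameters):
# each training sequence is a 3-D tensor of shape (sequence_length, batch_size, input_features).
import torch

model = LSTM1(num_classes=1, input_size=3, hidden_size=5, num_layers=1)
train_data_list = [torch.randn(30, 1, 3) for _ in range(8)]
target_data_list = [torch.randn(30, 1) for _ in range(8)]  # one target per time step
test_data = torch.randn(30, 1, 3)
test_target_data = torch.randn(30, 1)

train_losses, test_losses = train_test_rnn(
    model, train_data_list, target_data_list, test_data, test_target_data,
    epoch_size=100, learning_rate=1e-3, epoch_num_per_log=20)
```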

Outputs when the LSTM is updated:

```
Epoch: 0, total train loss: 1739.00158
### weights of linear layer:
Parameter containing:
tensor([[-0.1375,  0.2081,  0.2704, -0.2028, -0.1826]], requires_grad=True)
### bias of linear layer:
Parameter containing:
tensor([0.1059], requires_grad=True)
### gradients of linear layer weights:
tensor([[ 0.0359, -0.0308, -0.0049,  0.3072, -0.0360]])
### gradients of linear layer bias:
tensor([-0.8488])

Epoch: 20, total train loss: 1564.45506
### weights of linear layer:
Parameter containing:
tensor([[-0.4347,  0.4605,  0.6158, -0.4735, -0.5050]], requires_grad=True)
### bias of linear layer:
Parameter containing:
tensor([0.3092], requires_grad=True)
### gradients of linear layer weights:
tensor([[ 0.4102, -0.4120, -0.3741,  0.4114,  0.3959]])
### gradients of linear layer bias:
tensor([-0.4307])
```

Outputs when the LSTM model was not updated at all:

```
Epoch: 0, total train loss: 1757.77188
### weights of linear layer:
Parameter containing:
tensor([[ 0.3965, -0.1526, -0.2210,  0.0598, -0.1824]], requires_grad=True)
### bias of linear layer:
Parameter containing:
tensor([-0.3026], requires_grad=True)
### gradients of linear layer weights:
tensor([[0., 0., 0., 0., 0.]])
### gradients of linear layer bias:
tensor([0.])

Epoch: 20, total train loss: 1757.77188
### weights of linear layer:
Parameter containing:
tensor([[ 0.3965, -0.1526, -0.2210,  0.0598, -0.1824]], requires_grad=True)
### bias of linear layer:
Parameter containing:
tensor([-0.3026], requires_grad=True)
### gradients of linear layer weights:
tensor([[0., 0., 0., 0., 0.]])
### gradients of linear layer bias:
tensor([0.])
```