0 gradients in training RNN and LSTM for some runs

Hi, I am trying to train an RNN and an LSTM on some data for a regression problem. For some runs they train fine, but for other runs (roughly 1 in 3) the models are not updated at all and the training loss does not change over the epochs, even though I use exactly the same data and code as in the runs that trained fine.
To see what was happening, I printed the gradients during training, and it turned out that for the runs with no improvement, the gradients of the RNN / LSTM were always 0.
Does anyone know why this happens?
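(To be concrete about what I mean by "printing the gradients": the check I used is the one inside the training function below, but a generic version over all parameters would look something like this minimal sketch, where the model and data are just placeholders.)

# Minimal sketch of a per-parameter gradient check (placeholder model and data)
import torch
import torch.nn as nn

model = nn.Linear(5, 1)                      # stand-in for the RNN / LSTM
x, y = torch.randn(8, 5), torch.randn(8, 1)  # dummy batch for a regression loss

loss = nn.MSELoss()(model(x), y)
loss.backward()

for name, param in model.named_parameters():
    grad_norm = None if param.grad is None else param.grad.norm().item()
    print(f"{name}: grad norm = {grad_norm}")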
For reference, here is a snippet of my code.
LSTM model:

import torch
import torch.nn as nn

class LSTM1(nn.Module):
    def __init__(self, num_classes, input_size, hidden_size, num_layers, dropout=0):
        super(LSTM1, self).__init__()
        
        # define LSTM parameters
        self.num_classes = num_classes #number of classes
        self.num_layers = num_layers #number of layers
        self.input_size = input_size #input size
        self.hidden_size = hidden_size #hidden state
        
        self.lstm = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                            num_layers=num_layers, batch_first=False, dropout=dropout)
        # If batch_first is False, the expected input shape is (L, N, H_in), where L is the
        # sequence length, N is the batch size, and H_in is the number of input features (input_size)
        # https://pytorch.org/docs/stable/generated/torch.nn.LSTM.html
        # To use dropout > 0, num_layers must be greater than 1
        
        self.fc =  nn.Linear(hidden_size, num_classes) #fully connected linear layer
                
        self.relu = nn.ReLU() 

        self.drop = nn.Dropout(dropout)
    
    def forward(self,x):
        # x is assumed to be in (sequence_length, batch_size, input_features).
        
        h_0 = torch.zeros(self.num_layers, x.size(1), self.hidden_size) #hidden state
        c_0 = torch.zeros(self.num_layers, x.size(1), self.hidden_size) #internal state
        # new cell state and hidden state are used for every new sequence
        # 1 full sequence is dealt with in 1 call of forward()
        # That's why initialized h_0 and c_0 are prepared for every call of forward()
        
        # Propagate input through LSTM
        output, (hn, cn) = self.lstm(x, (h_0, c_0)) # LSTM with input, hidden state, and cell state
        # <- I could omit h_0 and c_0, since the defaults are zero tensors of the same shapes
        # output contains the hidden states of the last layer at every time step
        # (h_1, h_2, ..., h_n), including hn. In an LSTM, the hidden states serve as the output.

        out = output.view(-1, self.hidden_size)
        out = self.drop(out)

        out = self.fc(out) # Final Output
        # since out contains hidden states of all time steps as described in the comment above,
        # this linear layer is applied to all the time steps
        out = self.relu(out) #relu
        
        return out
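(To show how I use this class: below is a minimal instantiation and a dummy forward pass. The sizes are just example values, not the exact hyperparameters of my failing runs.)

# Example usage of LSTM1 with dummy data (example sizes only)
model = LSTM1(num_classes=1, input_size=5, hidden_size=5, num_layers=1)

seq_len, batch_size, n_features = 10, 1, 5
x = torch.randn(seq_len, batch_size, n_features)  # (L, N, H_in), since batch_first=False
out = model(x)
print(out.shape)  # torch.Size([10, 1]) -> one prediction per time step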

training code:

import numpy as np
import torch
import torch.nn as nn

def train_test_rnn(rnn_obj, train_data_list, target_data_list, test_data, test_target_data,
                   epoch_size, learning_rate, epoch_num_per_log, grad_clip_val=1.0):
    if not isinstance(train_data_list, list):
        raise RuntimeError("The train data must be a list of 3-D tensors, each of which corresponds to 1 sequence.")

    # loss function is MSE
    criterion = nn.MSELoss()
        
    # optimizer is Adam
    optimizer = torch.optim.Adam(rnn_obj.parameters(), lr=learning_rate)

    train_loss_list = []
    test_loss_list = []
    
    # to activate nn.Dropout(), set the model to train mode
    rnn_obj.train()
    
    for epoch in range(epoch_size):

        # to learn sequences in a random order (I don't know if this is effective)
        perm = np.random.permutation(len(train_data_list))
        
        total_train_loss = 0
        
        # for each sequence, execute model update
        for i in range(len(train_data_list)):
            train_data = train_data_list[perm[i]]
            target_data = target_data_list[perm[i]]
                        
            outputs = rnn_obj.forward(train_data) #forward pass
            
            optimizer.zero_grad() # set previously accumulated gradients to 0
            
            loss = criterion(outputs, target_data)
                       
            loss.backward() # backpropagate to compute the gradients of the loss
            
            # gradient clipping
            torch.nn.utils.clip_grad_norm_(rnn_obj.parameters(), grad_clip_val)
                        
            optimizer.step() 
            # Performs a single optimization step (parameter update).
            
            total_train_loss += loss.item()
            
        if epoch % epoch_num_per_log == 0 or epoch == epoch_size-1:           
            print("Epoch: %d, total train loss: %1.5f" % (epoch, total_train_loss))
           
            print("### weights of linear layer:")
            print(rnn_obj.fc.weight)
            print("### weights of bias of linear layer:")
            print(rnn_obj.fc.bias)
            print("### gradients of linear layer:")
            print(rnn_obj.fc.weight.grad)
            print("### gradients of bias of linear layer:")
            print(rnn_obj.fc.bias.grad)
            
            rnn_obj.eval()      # switch to eval() mode, to turn off dropout and similar features
            
            # first, loss with train data
            total_train_loss = 0
            for i in range(len(train_data_list)):
                outputs = rnn_obj.forward(train_data_list[i])
                loss = criterion(outputs, target_data_list[i])
                total_train_loss += loss.item()
            total_train_loss /= len(train_data_list)
            train_loss_list.append(total_train_loss)

            # second, loss with test data
            outputs = rnn_obj.forward(test_data)
            loss = criterion(outputs, test_target_data)
            test_loss_list.append(loss.item())

            rnn_obj.train()     # return the state to train() mode, as training continues

    train_loss_array = np.array(train_loss_list)
    test_loss_array = np.array(test_loss_list)
    return train_loss_array, test_loss_array
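(For completeness, this is roughly how I call the function. The random tensors below only illustrate the expected shapes and are not my real data, and the hyperparameter values are placeholders.)

# Toy call of train_test_rnn() with random data (shapes and hyperparameters are examples only)
model = LSTM1(num_classes=1, input_size=5, hidden_size=5, num_layers=1)

seq_len, n_features = 10, 5
train_data_list  = [torch.randn(seq_len, 1, n_features) for _ in range(4)]  # list of (L, N=1, H_in) sequences
target_data_list = [torch.randn(seq_len, 1) for _ in range(4)]              # matches the (L*N, num_classes) output
test_data        = torch.randn(seq_len, 1, n_features)
test_target_data = torch.randn(seq_len, 1)

train_loss_array, test_loss_array = train_test_rnn(
    model, train_data_list, target_data_list, test_data, test_target_data,
    epoch_size=100, learning_rate=1e-3, epoch_num_per_log=20)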

Outputs when the LSTM is updated:

Epoch: 0, total train loss: 1739.00158
### weights of linear layer:
Parameter containing:
tensor([[-0.1375,  0.2081,  0.2704, -0.2028, -0.1826]], requires_grad=True)
### weights of bias of linear layer:
Parameter containing:
tensor([0.1059], requires_grad=True)
### gradients of linear layer:
tensor([[ 0.0359, -0.0308, -0.0049,  0.3072, -0.0360]])
### gradients of bias of linear layer:
tensor([-0.8488])

Epoch: 20, total train loss: 1564.45506
### weights of linear layer:
Parameter containing:
tensor([[-0.4347,  0.4605,  0.6158, -0.4735, -0.5050]], requires_grad=True)
### weights of bias of linear layer:
Parameter containing:
tensor([0.3092], requires_grad=True)
### gradients of linear layer:
tensor([[ 0.4102, -0.4120, -0.3741,  0.4114,  0.3959]])
### gradients of bias of linear layer:
tensor([-0.4307])

Outputs when the LSTM model was not updated at all:

Epoch: 0, total train loss: 1757.77188
### weights of linear layer:
Parameter containing:
tensor([[ 0.3965, -0.1526, -0.2210,  0.0598, -0.1824]], requires_grad=True)
### weights of bias of linear layer:
Parameter containing:
tensor([-0.3026], requires_grad=True)
### gradients of linear layer:
tensor([[0., 0., 0., 0., 0.]])
### gradients of bias of linear layer:
tensor([0.])

Epoch: 20, total train loss: 1757.77188
### weights of linear layer:
Parameter containing:
tensor([[ 0.3965, -0.1526, -0.2210,  0.0598, -0.1824]], requires_grad=True)
### weights of bias of linear layer:
Parameter containing:
tensor([-0.3026], requires_grad=True)
### gradients of linear layer:
tensor([[0., 0., 0., 0., 0.]])
### gradients of bias of linear layer:
tensor([0.])