LSTM-PPO: Gradient of Lstm.weight_hh_l[k] is always zero


I’m implementing PPO with a recurrent actor and recurrent critic (both the actor and critic have a LSTM layer in them). The problem is, for both of the LSTM layers (in actor and critic) the gradient of weight_hh_l[k] is always zero, whereas the gradient of weight_ih_l[k] isn’t.

Can someone help ?

Here’s the code for the critic, the actor is very similar.

Important notes: please don’t mind the fact that I don’t normalize all the elements of x in function normalize(x). This is simply related to my environment where the last two elements of the state are already between 0 and 1. Also, the critic recieves both the action and the state as input for the forward function.

class CriticNetwork(nn.Module):
    def __init__(self, nb_time_steps, len_action, len_state, num_layers=1, dropout=0, orthogonal_init=False, cuda=False):
        super(CriticNetwork, self).__init__()
        self.len_action = len_action
        self.len_state = len_state
        self.num_layers = num_layers
        self.hidden_size = 1
        self.cuda = cuda
        self.nb_time_steps = nb_time_steps
        rnn_layer = nn.LSTM(input_size=len_state+self.len_action, hidden_size=self.hidden_size, num_layers=num_layers,
                            batch_first=True, dropout=dropout, device='cuda' if self.cuda else 'cpu')
        output_layer = nn.Linear(self.hidden_size, self.hidden_size, device='cuda' if self.cuda else 'cpu')
        if orthogonal_init:
            orthogonal_init_func([rnn_layer, output_layer])

        self.layers = nn.ModuleDict({'rnn_layer': rnn_layer,
                                     'output_layer': output_layer,
                                     'normalization_layer': nn.LayerNorm(len_state - 2, eps=1e-05,
        self.activations = nn.ModuleDict({'relu_0': nn.LeakyReLU(), 'tanh': nn.Tanh()})
        if cuda:

    def forward(self, x):  # x = action, state
        normalized_x = self.normalize(x)
        h_0 = Variable(torch.zeros(self.num_layers, normalized_x.size(0), self.hidden_size,
                                   device='cuda' if self.cuda else 'cpu'))  # hidden state
        c_0 = Variable(torch.zeros(self.num_layers, normalized_x.size(0), self.hidden_size,
                       device='cuda' if self.cuda else 'cpu'))  # internal state
        # Propagate input through LSTM
        output, (hn, cn) = self.layers['rnn_layer'](normalized_x, (h_0, c_0))  # lstm with input, hidden, and internal state
        hn_ = hn[-1].view(-1, self.hidden_size)  # reshaping the data for Dense layer next
        output_rnn = self.activations['relu_0'](self.layers['output_layer'](hn_))
        return self.activations['tanh'](torch.reshape(output_rnn, (self.nb_time_steps, 1)))

    def normalize(self, x): # x = action, state
        normalized_x = []
        for x_elem in x:
            normalized_state =[0:self.len_action],
                                          self.layers['normalization_layer'](x_elem[self.len_action:len(x_elem) - 2]),
        normalized_x = torch.stack(normalized_x)
        return torch.reshape(normalized_x, (normalized_x.shape[0], 1, normalized_x.shape[1]))

The orthogonal initialization function I wrote is:

def orthogonal_init_func(layers):
    for layer in layers:
        if type(layer) in [nn.GRU, nn.LSTM, nn.RNN]:
            for name, param in layer.named_parameters():
                if 'weight_ih' in name:
                elif 'weight_hh' in name:
                elif 'bias' in name:


So I solved this by initializing c_0 and h_0 with torch.ones rather than torch.zeros
The gradient isn’t zero anymore but the LSTM doesn’t really converge…

I’m still looking for ways to improve the LSTM performance