Gradients are always zero

I have designed a DNN which has the following structure:

class my_Net(nn.Module):
    """3D-CNN encoder -> ConvLSTM -> 2D upsampling head.

    Expects input of shape (batch, channels=1, time, height, width);
    the final F.interpolate resizes the spatial output to 61x61.
    """

    def __init__(self, device='cuda:0'):
        super(my_Net, self).__init__()

        self.device = device
        # Applies max-pooling to each time frame independently (81 steps).
        self.tdmp = TimeDistributed(step=81)
        self.bncnn3drelu = BnCNN3DReLU(input_channels=1, output_channels=32)
        self.cnn3drelu_1 = CNN3DReLU(input_channels=32, output_channels=64)
        self.cnn3drelu_2 = CNN3DReLU(input_channels=64, output_channels=128)
        # BUG FIX: forward() references self.cnn3drelu_3, which was never
        # defined (AttributeError at runtime). Added as a 128->128 stage so
        # the channel chain still matches cnn3drelu_4's 128-channel input.
        # TODO(review): confirm the intended channel width of this stage.
        self.cnn3drelu_3 = CNN3DReLU(input_channels=128, output_channels=128)
        self.cnn3drelu_4 = CNN3DReLU(input_channels=128, output_channels=256)
        # BUG FIX: ConvLSTM.__init__ requires num_layers, which was missing
        # from this call. hidden_dim has three entries, so num_layers=3.
        self.convlstm = ConvLSTM(input_size=(3, 3), input_channels=256,
                                 hidden_dim=[16, 16, 16],
                                 kernel_size=(3, 3),
                                 num_layers=3,
                                 target=[[80, 2]],
                                 return_all_layers=False, device=self.device)
        self.cnn2dup_1 = UPCNN2DReLU(size=(3, 3), input_channels=16, output_channels=1)

    def forward(self, x):
        # Encoder: alternate 3D conv stages with time-distributed pooling.
        out = self.bncnn3drelu(x)
        out = self.cnn3drelu_1(out)
        out = self.tdmp(out)
        out = self.cnn3drelu_2(out)
        out = self.tdmp(out)
        out = self.cnn3drelu_3(out)
        out = self.tdmp(out)
        out = self.cnn3drelu_4(out)
        out = self.tdmp(out)
        # Temporal modelling, then 2D head and spatial resize.
        out = self.convlstm(out)
        out = self.cnn2dup_1(out)
        out = F.interpolate(out, (61, 61))
        return out

where TimeDistributed, BnCNN3DReLU, CNN3DReLU, ConvLSTM, and UPCNN2DReLU are "apply maxpool2d to each frame of the time dimension", "batch normalization + 3D CNN + ReLU", "3D CNN + ReLU", "convolutional LSTM", and "2D CNN + ReLU + upsampling", respectively.

My data is a tensor with batch_size, num_channel, time, heights, width = data.size(). No matter which loss function I have, the gradients are always zero. I’m training the network using the following codes:

out = model(data)

loss = criterion_b(out, label)
optimizer = optim.Adam(model.parameters(), lr=learning_rate)


I think the optimizer should be created once, before the forward pass, not re-created on every iteration.

optimizer = optim.Adam(model.parameters(), lr=learning_rate)

out = model(data)
loss = criterion_b(out, label)

Hope this helps.

Thanks for your reply. Unfortunately it does not work yet.

I actually think that the ConvLSTM causes the problem. Even when I use a single ConvLSTM as my model, in the first iteration only the outer layers have gradients and the rest are zero. See figure 1.


If I keep doing the calculation, again, the gradients all go to zero! See figure 2.


I did not find any built-in function for a ConvLSTM, so I am using the following code. I would appreciate your comments if you have better code, or if you can tell me whether there is a problem in this code.

class ConvLSTMCell(nn.Module):

    def __init__(self, input_size, input_dim, hidden_dim, kernel_size, bias, device):
        Initialize ConvLSTM cell.
        input_size: (int, int)
            Height and width of input tensor as (height, width).
        input_dim: int
            Number of channels of input tensor.
        hidden_dim: int
            Number of channels of hidden state.
        kernel_size: (int, int)
            Size of the convolutional kernel.
        bias: bool
            Whether or not to add the bias.

        super(ConvLSTMCell, self).__init__()

        self.height, self.width = input_size
        self.input_dim  = input_dim
        self.hidden_dim = hidden_dim

        self.kernel_size = kernel_size
        self.padding     = kernel_size[0] // 2, kernel_size[1] // 2
        self.bias        = bias
        self.conv = nn.Conv2d(in_channels=self.input_dim + self.hidden_dim,
                              out_channels=4 * self.hidden_dim,
        self.device = device

    def forward(self, input_tensor, cur_state):
        h_cur, c_cur = cur_state
        combined =[input_tensor, h_cur], dim=1).to(self.device) # concatenate along channel axis

        combined_conv = self.conv(combined).to(self.device)
        cc_i, cc_f, cc_o, cc_g = torch.split(combined_conv, self.hidden_dim, dim=1) 
        i = torch.sigmoid(cc_i)
        f = torch.sigmoid(cc_f)
        o = torch.sigmoid(cc_o)
        g = torch.tanh(cc_g)

        c_next = f * c_cur + i * g
        h_next = o * torch.tanh(c_next)
        return h_next, c_next

    def init_hidden(self, batch_size):
        return (Variable(torch.zeros(batch_size, self.hidden_dim, self.height, self.width), requires_grad = True).to(self.device),
                Variable(torch.zeros(batch_size, self.hidden_dim, self.height, self.width), requires_grad = True).to(self.device))

class ConvLSTM(nn.Module):

    def __init__(self, input_size, input_channels, hidden_dim, kernel_size, num_layers,
                 target=[[0,0]], bias=True, return_all_layers=False, device='cuda:0'):
        super(ConvLSTM, self).__init__()

        self.device = device = target

        # Make sure that both `kernel_size` and `hidden_dim` are lists having len == num_layers
        kernel_size = self._extend_for_multilayer(kernel_size, num_layers)
        hidden_dim  = self._extend_for_multilayer(hidden_dim, num_layers)
        if not len(kernel_size) == len(hidden_dim) == num_layers:
            raise ValueError('Inconsistent list length.')

        self.height, self.width = input_size

        self.input_dim  = input_channels
        self.hidden_dim = hidden_dim
        self.kernel_size = kernel_size
        self.num_layers = num_layers
        self.bias = bias
        self.return_all_layers = return_all_layers

        cell_list = []
        for i in range(0, self.num_layers):
            cur_input_dim = self.input_dim if i == 0 else self.hidden_dim[i-1]

            cell_list.append(ConvLSTMCell(input_size=(self.height, self.width),
                                          device = self.device))

        self.cell_list = nn.ModuleList(cell_list)

    def forward(self, input_tensor, hidden_state=None):

        # Implement stateful ConvLSTM
        if hidden_state is not None:
            raise NotImplementedError()
            hidden_state = self._init_hidden(batch_size=input_tensor.size(0))

        layer_output_list = []
        last_state_list   = []
        out = torch.tensor([]).to(self.device)

        seq_len = input_tensor.size(2)
        cur_layer_input = input_tensor

        for layer_idx in range(self.num_layers):

            h, c = hidden_state[layer_idx]
            output_inner = []
            for t in range(seq_len):

                h, c = self.cell_list[layer_idx](input_tensor=cur_layer_input[:, :, t, :, :],
                                                 cur_state=[h, c])
            layer_output = torch.stack(output_inner, dim=2)
            cur_layer_input = layer_output

            last_state_list.append([h, c])
            if [t, layer_idx] in
                    out =[out , h], 1)

        return out

    def _init_hidden(self, batch_size):
        init_states = []
        for i in range(self.num_layers):
        return init_states

    def _check_kernel_size_consistency(kernel_size):
        if not (isinstance(kernel_size, tuple) or
                    (isinstance(kernel_size, list) and all([isinstance(elem, tuple) for elem in kernel_size]))):
            raise ValueError('`kernel_size` must be tuple or list of tuples')

    def _extend_for_multilayer(param, num_layers):
        if not isinstance(param, list):
            param = [param] * num_layers
        return param