The model is not learning

I’ve write a custom network but some how it does not learn, the gradients are always zeros.

  • [ ] network: sigunet
class sigunet(nn.Module):

    def __init__(self, model, concate, m, n, kernel_size, pool_size, threshold, device, sequence_length=96):
        super(sigunet, self).__init__()

        self.model = model
        self.concate = concate
        self.m = m
        self.n = n
        self.kernel_size = kernel_size
        self.pool_size = pool_size
        self.threshold = threshold
        self.device = device
        self.loss_function = nn.CrossEntropyLoss(ignore_index=IGNORE_INDEX)
        self.sequence_length = sequence_length
        pass1_len = sequence_length
        pass2_len = self.pool_len(pass1_len, 2, 2)
        pass3_len = self.pool_len(pass2_len, 2, 2)
        self.level_1 = [conv1d(concate.output_size, m, kernel_size).cuda(), conv1d(m, m, kernel_size).cuda(), avg_pool(2)]
        self.level_2 = [conv1d(m, (m + n), kernel_size).cuda(), conv1d((m + n), (m + n), kernel_size).cuda(), avg_pool(2)]
        self.level_3 = [conv1d((m + n), (m + 2 * n), kernel_size).cuda(), conv1d((m + 2 * n), (m + 2 *  n), kernel_size).cuda(), \
                        avg_pool(2)]
        self.delevel_1 = [conv1d((m + 2 * n), (m + 3 * n), kernel_size).cuda(), conv1d((m + 3 * n), (m + 3 * n), kernel_size).cuda(),\
                          deconv1d((m + 3 * n), (m + 2 * n), pass3_len, kernel_size, 2).cuda()]
        self.delevel_2 = [conv1d((2 * m + 4 * n), (m + 2 * n), kernel_size).cuda(), conv1d((m + 2 * n), (m + 2 * n), kernel_size).cuda(),\
                          deconv1d((m + 2 * n), (m + n), pass2_len, kernel_size, 2).cuda()]
        self.delevel_3 = [conv1d((2 * m + 2 * n), (m + n), kernel_size).cuda(), conv1d((m + n), (m + n), kernel_size).cuda(),\
                          deconv1d((m + n), m, pass1_len, kernel_size, 2).cuda()]
        self.finals = [conv1d((2 * m), m, kernel_size).cuda(), conv1d(m, 3, kernel_size, nn.Softmax(dim=1)).cuda()]

    def forward(self, inputs, targets):

        outputs = self.model(inputs)
        indexed_sequences, _ = inputs
        onehot = index2onehot(dim=self.concate.onehot_size, indexed_sequences=indexed_sequences, device=self.device)

        # the front two ouputs is going to be ignored
        # encoded_sources: (batch_size, seq_len, embed_size)
        mlm_outputs, nsp_outputs, encoded_sources = outputs
        # Permute the axis to adapt to nn.Conv1d
        # encoded_sources: (batch_size, embed_size, seq_len)
        # https://discuss.pytorch.org/t/swap-axes-in-pytorch/970/2
        sigunet_input_ = self.concate((encoded_sources, onehot))
        sigunet_input = sigunet_input_.transpose(2, 1)

        out = self.level_1[0](sigunet_input)
        pass1 = self.level_1[1](out)
        out = self.level_1[2](pass1)

        out = self.level_2[0](out)
        pass2 = self.level_2[1](out)
        out = self.level_2[2](pass2)
        
        out = self.level_3[0](out)
        pass3 = self.level_3[1](out)
        out = self.level_3[2](pass3)

        out = self.delevel_1[0](out)
        out = self.delevel_1[1](out)
        out = self.delevel_1[2](out)

        out = torch.cat([out, pass3], dim=1)

        out = self.delevel_2[0](out)
        out = self.delevel_2[1](out)
        out = self.delevel_2[2](out)

        out = torch.cat([out, pass2], dim=1)

        out = self.delevel_3[0](out)
        out = self.delevel_3[1](out)
        out = self.delevel_3[2](out)

        out = torch.cat([out, pass1], dim=1)

        out = self.finals[0](out)
        out = self.finals[1](out)

        _out = out.transpose(2, 1)

        # Make it (batch_size, length, channels)
        #trans_out = out.transpose(2, 1)
        # errorenous
        #out, _ = torch.max(out, 2)
        predictions = self.pass_threshold(_out)

        loss = self.loss_function(_out.reshape(-1, 3), targets.reshape(-1))
        # predictions = self.detect_SignalPeptides(_out)

        return predictions, loss.unsqueeze(dim=0)
  • [ ] trainer
...
for param in self.loss_model.parameters():                                                                                        
    print(param.grad.data.sum())                                                                                                  
...

the self.loss_model mentioned above is sigunet
and the grads are all zeros.

I assume it could be some operations that are not prop-able.

Could you try to use nn.ModuleList instead of Python lists to store your layers?
This will make sure all parameters are returned when calling model.parameters() and passing them to the optimizer.
However, this would not explain the gradients to be all zero, but might be a starter in debugging this issue.

I think I’ve found the problem.
According to: Why "loss.backward()" didn't update parameters' gradient?
Add a batch normalization layer could solve this.