How to compute gradients for conditionals?

I want to create a mask from a score threshold and multiply it with the weight matrix.
To create the mask I'm using the condition shown in the code below, but the gradients for all score parameters come back as zero.
How can I compute gradients for the score parameters?

import torch
import torch.nn.functional as F


class SuperMaskMLP(torch.nn.Module):
  
    def __init__(self,num_features,hidden_nodes_list,num_classes):
        super(SuperMaskMLP, self).__init__()
        
        num_hidden_layers = len(hidden_nodes_list)
        self.hidden = torch.nn.ModuleList()
        self.score = torch.nn.ModuleList()

        if num_hidden_layers == 0:
            self.hidden.append(torch.nn.Linear(num_features, num_classes))
            self.score.append(torch.nn.Linear(num_features, num_classes))
        else:
            self.hidden.append(torch.nn.Linear(num_features, hidden_nodes_list[0]))
            self.score.append(torch.nn.Linear(num_features, hidden_nodes_list[0]))
            for k in range(num_hidden_layers - 1):
                self.hidden.append(torch.nn.Linear(hidden_nodes_list[k], hidden_nodes_list[k + 1]))
                self.score.append(torch.nn.Linear(hidden_nodes_list[k], hidden_nodes_list[k + 1]))
            self.hidden.append(torch.nn.Linear(hidden_nodes_list[-1], num_classes))
            self.score.append(torch.nn.Linear(hidden_nodes_list[-1], num_classes))

        
    # input : features
    # output: logits, log-probabilities
    def forward(self, x):
        out = x
        i = 0
        for layer in self.hidden[:-1]:
            # Build a binary mask from the score parameters by thresholding
            # each entry against the mean score (1 above the mean, 0 below).
            wmask = self.score[i].weight.clone()
            wmask[self.score[i].weight >= self.score[i].weight.mean()] = 1
            wmask[self.score[i].weight < self.score[i].weight.mean()] = 0
            bmask = self.score[i].bias.clone()
            bmask[self.score[i].bias >= self.score[i].bias.mean()] = 1
            bmask[self.score[i].bias < self.score[i].bias.mean()] = 0
            # Apply the mask to this layer's weight and bias, then compute the layer output.
            w = layer.weight * wmask
            b = layer.bias * bmask
            out = torch.matmul(out, torch.t(w)) + b
            out = F.relu(out)
            i += 1

        # Same masking for the output layer.
        wmask = self.score[i].weight.clone()
        wmask[self.score[i].weight >= self.score[i].weight.mean()] = 1
        wmask[self.score[i].weight < self.score[i].weight.mean()] = 0
        bmask = self.score[i].bias.clone()
        bmask[self.score[i].bias >= self.score[i].bias.mean()] = 1
        bmask[self.score[i].bias < self.score[i].bias.mean()] = 0
        w = self.hidden[-1].weight * wmask
        b = self.hidden[-1].bias * bmask
        logits = torch.matmul(out, torch.t(w)) + b
        probas = F.log_softmax(logits, dim=1)  # log-probabilities
        return 1, logits, probas
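
For reference, this is roughly how I see the zero gradients. The sizes and the loss below are just placeholders for illustration, not my real training setup:

model = SuperMaskMLP(num_features=20, hidden_nodes_list=[32], num_classes=5)
x = torch.randn(8, 20)
y = torch.randint(0, 5, (8,))

_, logits, probas = model(x)
loss = F.nll_loss(probas, y)   # probas are log-probabilities, so nll_loss
loss.backward()

# Every entry of the score gradients comes back as zero.
print(model.score[0].weight.grad)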

Hi,

The derivative tells you how much the output will change for a very small change in the input.
If your score parameters are only used through a condition like that, changing them a little won't change the output at all, so the gradient with respect to them is 0.
So all zeros are the correct gradients here, I'm afraid.
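
To see this concretely, here is a tiny standalone example (the names and sizes are just for illustration) that builds the same kind of threshold mask from a tensor that requires grad:

import torch

s = torch.randn(4, requires_grad=True)

# Same pattern as wmask/bmask above: clone, then overwrite every entry
# with 0 or 1 based on a comparison against the mean.
mask = s.clone()
mask[s >= s.mean()] = 1
mask[s < s.mean()] = 0

w = torch.randn(4)        # stands in for a weight being masked
out = (w * mask).sum()
out.backward()

# Every entry of mask was overwritten with a constant, and the comparison
# itself is not differentiable, so nothing flows back to s.
print(s.grad)  # tensor([0., 0., 0., 0.])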