Custom Layer weight is not updating

I have this custom layer, adapted from the Linear class:

import math

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.parameter import Parameter


class LinearDropout(nn.Module):
    __constants__ = ['in_features', 'out_features']
    in_features: int
    out_features: int
    weight: torch.Tensor

    def __init__(self, in_features: int, out_features: int,
                 device=None, dtype=None,  inplace: bool = False) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.empty((out_features, in_features), **factory_kwargs))
        self.reset_parameters()
        self.inplace = inplace
    
    def reset_parameters(self) -> None:
        # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with
        # uniform(-1/sqrt(in_features), 1/sqrt(in_features)). For details, see
        # https://github.com/pytorch/pytorch/issues/57109
        torch.nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        
    def extra_repr(self) -> str:
        return 'in_features={}, out_features={}'.format(
            self.in_features, self.out_features
        )
    
    def forward(self, input: torch.Tensor) -> torch.Tensor:
        print(f'Inputs size {input.size()}')
        weight_item = self.weight.item() 
        print(weight_item)
        dropout = F.dropout(input, weight_item, self.training, self.inplace)
        print(f'Dropout size {dropout.size()}')
        print(input.all() == dropout.all())
        return dropout

By doing this the weight doesn't change across iterations, and I don't know why. When I change the return to return input * self.weight, the weight changes, which makes me believe the problem is due to how I use the weight as the dropout p.

I also tried another way:

class LinearWindow(nn.Module):
   
    __constants__ = ['in_features', 'out_features']
    in_features: int
    out_features: int
    weight: torch.Tensor

    def __init__(self, in_features: int, out_features: int,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.empty((out_features, in_features), **factory_kwargs))
        self.reset_parameters()

    def reset_parameters(self) -> None:
        # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with
        # uniform(-1/sqrt(in_features), 1/sqrt(in_features)). For details, see
        # https://github.com/pytorch/pytorch/issues/57109
        torch.nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        inputs_percentage = int(float(len(input)) * self.weight)
        print(inputs_percentage)
        print(self.weight)
        inputs_wi = input[0:inputs_percentage]
        print(f'Inputs window size {inputs_wi.size()}')

        if len(inputs_wi) < len(input):
            d = int(len(input) - len(inputs_wi))
            d = torch.zeros(d, inputs_wi.size()[1])
            inputs_wi = torch.cat([inputs_wi, d])
        print(f'Window size {inputs_wi.size()}')
        return inputs_wi

    def extra_repr(self) -> str:
        return 'in_features={}, out_features={}'.format(
            self.in_features, self.out_features
        )

But I have the same problem: the weight doesn't update.

You are creating a Python scalar via:

weight_item = self.weight.item() 

which Autograd sees as a constant that is not attached to the trainable self.weight. Remove the .item() call and it should work.
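
For illustration, a minimal, self-contained sketch of the difference (the shapes are chosen only for the demo). The LinearWindow version has the same issue, since int(tensor) also returns a plain Python number:

import torch
from torch.nn.parameter import Parameter

w = Parameter(torch.tensor(0.5))
x = torch.ones(4)

# Using the Parameter directly keeps it in the graph, so it receives a gradient.
out = (x * w).sum()
out.backward()
print(w.grad)  # tensor(4.)

# .item() (or int()/float()) returns a plain Python number; autograd treats it
# as a constant, so the result is no longer connected to w at all.
out_const = (x * w.item()).sum()
print(out_const.requires_grad)  # False -- no gradient will ever reach w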

If I remove the .item() and just do dropout = F.dropout(input, self.weight, self.training, self.inplace), I get this error:

TypeError: dropout(): argument 'p' (position 2) must be float, not Parameter

Is there a way to make my own dropout layer whose drop probability is a trainable parameter?

You are right: the drop probability in F.dropout cannot be trained and needs to be a plain Python float.
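
If you want the rate itself to receive a gradient, one possible direction (not something F.dropout supports; this is only a sketch in the spirit of Concrete Dropout by Gal et al., with illustrative names, initialization, and temperature) is to build the mask yourself from a relaxed Bernoulli, so the computation graph reaches the rate parameter:

import torch
import torch.nn as nn

class RelaxedDropout(nn.Module):
    """Dropout-like layer with a learnable drop rate: the binary mask is
    replaced by a sigmoid relaxation so gradients can flow to the rate."""

    def __init__(self, init_rate: float = 0.1, temperature: float = 0.1) -> None:
        super().__init__()
        init_rate = float(init_rate)
        # Store the rate as an unconstrained logit; sigmoid keeps it in (0, 1).
        self.p_logit = nn.Parameter(torch.log(torch.tensor(init_rate / (1.0 - init_rate))))
        self.temperature = temperature

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        if not self.training:
            return input
        p = torch.sigmoid(self.p_logit)  # drop probability, strictly in (0, 1)
        eps = 1e-7
        u = torch.rand_like(input)
        # Relaxed Bernoulli "drop" variable; close to 0/1 for small temperatures.
        drop = torch.sigmoid(
            (torch.log(p + eps) - torch.log(1.0 - p + eps)
             + torch.log(u + eps) - torch.log(1.0 - u + eps)) / self.temperature
        )
        mask = 1.0 - drop
        # Rescale so the expected magnitude matches the input, like regular dropout.
        return input * mask / (1.0 - p + eps)

Concrete Dropout also adds a regularization term on the rate, which is omitted here; without it the learned rate simply moves to whatever value minimizes the task loss.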

I can force the weight to learn by doing return dropout * self.weight, but it feels a little off to me.

Edit: After consideration, I think it is okay to have this return, as the weight is constrained between 0.0 and 1.0, so it is like multiplying the dropout output by a percentage, which shouldn't remove too much information.
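
For reference, a minimal sketch of that forward, assuming the layer keeps a single scalar weight and clamps it into [0.0, 1.0] (the clamp here is just one way to enforce the constraint):

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        # Assumption: self.weight is a single scalar Parameter kept in [0.0, 1.0].
        w = self.weight.clamp(0.0, 1.0)
        # .item() turns the rate into a constant for autograd, so the dropout
        # call itself contributes no gradient to the weight...
        dropped = F.dropout(input, w.item(), self.training, self.inplace)
        # ...the only gradient path to the weight is this final multiplication.
        return dropped * w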

I have an updated custom layer:

class LinearWindow(nn.Module):
   
    __constants__ = ['in_features', 'out_features']
    in_features: int
    out_features: int
    weight: torch.Tensor

    def __init__(self, in_features: int, out_features: int,
                 device=None, dtype=None) -> None:
        factory_kwargs = {'device': device, 'dtype': dtype}
        super().__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.ones((out_features, in_features), **factory_kwargs))
        self.reset_parameters()

    def reset_parameters(self) -> None:
        # Setting a=sqrt(5) in kaiming_uniform is the same as initializing with
        # uniform(-1/sqrt(in_features), 1/sqrt(in_features)). For details, see
        # https://github.com/pytorch/pytorch/issues/57109
        torch.nn.init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        print(self.weight)
        
        inputs_wi = input[: int(len(input) * self.weight)]
        print(f'Inputs window size {inputs_wi.size()}')
        if len(inputs_wi) < len(input):
            d = int(len(input) - len(inputs_wi))
            d = torch.zeros(d, input.size()[1])
            inputs_wi = torch.cat([inputs_wi, d])
        #print(f'Window size {inputs_wi.size()}')
        return inputs_wi * self.weight

    def extra_repr(self) -> str:
        return 'in_features={}, out_features={}'.format(
            self.in_features, self.out_features
        )

Sometimes the weight trains, and sometimes it does not. When it does not train, I interrupt and restart the training.

EDIT: The weight is constrained between 0.0 and 1.0, and it sometimes gets stuck at 0.0. When the weight doesn't hit the lowest value of the interval, 0.0 in this case, it trains without problems.
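
A possible explanation (assuming the weight is effectively a single scalar and the constraint is applied as a hard clamp, which the posted code doesn't show): whenever int(len(input) * weight) evaluates to 0, in particular when the weight sits exactly at 0.0, the window is empty, the output is all zeros, and the gradient of the output with respect to the weight is therefore zero too, so nothing can push the weight away from that value again. A minimal sketch of a forward that keeps at least one row in the window so the gradient path never fully vanishes (the clamp and the scalar weight are assumptions):

    def forward(self, input: torch.Tensor) -> torch.Tensor:
        w = self.weight.clamp(0.0, 1.0)
        # Keep at least one row in the window; if the slice is ever empty the
        # output is all zeros and the gradient to the weight is zero as well,
        # which is exactly the "stuck at 0.0" dead state.
        n = max(1, int(len(input) * w.item()))
        inputs_wi = input[:n]
        if len(inputs_wi) < len(input):
            pad = torch.zeros(len(input) - len(inputs_wi), input.size(1),
                              device=input.device, dtype=input.dtype)
            inputs_wi = torch.cat([inputs_wi, pad])
        # The hard slice is not differentiable w.r.t. w; the gradient only
        # flows through this final multiplication.
        return inputs_wi * w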