ValueError: can't optimize a non-leaf Tensor

PyTorch doesn't support variational dropout in its LSTM cell, so I have borrowed an implementation from fastai. But there is a slight glitch.

class WeightDropout(Module):
    "A module that warps another layer in which some weights will be replaced by 0 during training."

    def __init__(self, module:nn.Module, weight_p:float, layer_names:Collection[str]=['weight_hh_l0']):
        self.module,self.weight_p,self.layer_names = module,weight_p,layer_names
        self.idxs = [] if hasattr(self.module, '_flat_weights_names') else None
        for layer in self.layer_names:
            #Makes a copy of the weights of the selected layers.
            w = getattr(self.module, layer)
            self.register_parameter(f'{layer}_raw', nn.Parameter(w.data))
            self.module._parameters[layer] = F.dropout(w, p=self.weight_p, training=False)
            if self.idxs is not None: self.idxs.append(self.module._flat_weights_names.index(layer))
        if isinstance(self.module, (nn.RNNBase, nn.modules.rnn.RNNBase)):
            self.module.flatten_parameters = self._do_nothing

    def _setweights(self):
        "Apply dropout to the raw weights."
        for i,layer in enumerate(self.layer_names):
            raw_w = getattr(self, f'{layer}_raw')
            self.module._parameters[layer] = F.dropout(raw_w, p=self.weight_p, training=self.training)
            if self.idxs is not None: self.module._flat_weights[self.idxs[i]] = self.module._parameters[layer]

    def forward(self, *args):
        self._setweights()
        with warnings.catch_warnings():
            #To avoid the warning that comes because the weights aren't flattened.
            warnings.simplefilter("ignore")
            return self.module.forward(*args)

Then one should use it like this:

module = nn.LSTM(5, 2)
dp_module = WeightDropout(module, 0.4)

It works fine when I print its parameters, but when I pass the parameters to the Adam optimizer

optimizer = torch.optim.Adam(dp_module.parameters(), lr=1e-3)

it raises

ValueError: can't optimize a non-leaf Tensor

Can someone tell me why, and what should I do?
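
As a quick way to see which tensor the optimizer is complaining about, one can list every parameter together with its is_leaf flag. This small snippet is added for illustration only (it assumes the dp_module from the snippet above):

# Illustration only: list each parameter name and whether it is a leaf tensor.
# Any entry with is_leaf == False is what the Adam constructor rejects.
for name, p in dp_module.named_parameters():
    print(name, tuple(p.shape), p.is_leaf)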

I don’t get this error using this code snippet:

class WeightDropout(nn.Module):
    "A module that warps another layer in which some weights will be replaced by 0 during training."

    def __init__(self, module:nn.Module, weight_p:float, layer_names):
        super(WeightDropout, self).__init__()
        self.module,self.weight_p,self.layer_names = module,weight_p,layer_names
        self.idxs = [] if hasattr(self.module, '_flat_weights_names') else None
        for layer in self.layer_names:
            #Makes a copy of the weights of the selected layers.
            w = getattr(self.module, layer)
            self.register_parameter(f'{layer}_raw', nn.Parameter(w.data))
            self.module._parameters[layer] = F.dropout(w, p=self.weight_p, training=False)
            if self.idxs is not None: self.idxs.append(self.module._flat_weights_names.index(layer))
        if isinstance(self.module, (nn.RNNBase, nn.modules.rnn.RNNBase)):
            self.module.flatten_parameters = self._do_nothing

    def _setweights(self):
        "Apply dropout to the raw weights."
        for i,layer in enumerate(self.layer_names):
            raw_w = getattr(self, f'{layer}_raw')
            self.module._parameters[layer] = F.dropout(raw_w, p=self.weight_p, training=self.training)
            if self.idxs is not None: self.module._flat_weights[self.idxs[i]] = self.module._parameters[layer]

    def forward(self, *args):
        self._setweights()
        with warnings.catch_warnings():
            #To avoid the warning that comes because the weights aren't flattened.
            warnings.simplefilter("ignore")
            return self.module.forward(*args)
        

    def reset(self):
        for layer in self.layer_names:
            raw_w = getattr(self, f'{layer}_raw')
            self.module._parameters[layer] = F.dropout(raw_w, p=self.weight_p, training=False)
        if hasattr(self.module, 'reset'): self.module.reset()    
    
    def _do_nothing(self): pass

module = nn.LSTM(5, 2)
dp_module = WeightDropout(module, 0.4, ['weight_hh_l0'])

optimizer = torch.optim.Adam(dp_module.parameters() , lr = 1e-3)
x = torch.randn(1, 1, 5)
out = dp_module(x)
out[0].mean().backward()

optimizer.step()

Could you check your code again or post an executable code snippet to reproduce this error?


Here is the full executable code. The fastai implementation implements weight drop, which is different from variational dropout: weight drop arbitrarily drops individual elements of the weight matrix, whereas variational dropout should randomly drop entire rows of the weight matrix. So I modified it, and I have highlighted the changes I made to the fastai implementation below.
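
To illustrate the difference on a tiny example (this snippet is my own addition and is not part of the fastai code): element-wise dropout zeroes individual entries of a matrix, while broadcasting a column mask across the columns zeroes whole rows.

import torch
import torch.nn.functional as F

w = torch.ones(4, 3)
# Weight drop: every element of the matrix is zeroed independently.
elementwise = F.dropout(w, p=0.5, training=True)
# Variational dropout: a (4, 1) mask is broadcast across the columns, so whole rows are zeroed.
row_mask = F.dropout(torch.ones(4, 1), p=0.5, training=True)
rowwise = w * row_mask
print(elementwise)
print(rowwise)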

class VariationalLSTM(nn.Module):

    """
    Modified from the code at https://github.com/fastai/fastai/blob/master/fastai/text/models/awd_lstm.py#L27.
    The fastai implementation provides WeightDropLSTM, which is different from VariationalLSTM.
    The former drops individual elements of the matrix that defines the recurrent connection in each layer.
    Variational dropout (VDropout) differs in the following ways, hence these changes:

    i)  VDropout should zero out whole rows of the matrices, instead of zeroing arbitrary elements
    ii) VDropout also zeros the matrices involved in the output connection
    """

    def __init__(self, module, weight_p=0.0, layer_names=['weight_hh_l0']):

        super().__init__()

        self.module = module
        self.weight_p, self.layer_names = weight_p, layer_names
        self.idxs = [] if hasattr(self.module, '_flat_weights_names') else None

        for layer in self.layer_names:

            w = getattr(self.module, layer)
            self.register_parameter(f'{layer}_raw', nn.Parameter(w.data))

            # this is the part that is different from weight drop
            N, K = w.shape
            mask = F.dropout(torch.ones(N, 1), p=self.weight_p, training=False)
            mask = mask.repeat(1, K)
            self.module._parameters[layer] = w * mask

            if self.idxs is not None: self.idxs.append(self.module._flat_weights_names.index(layer))
        if isinstance(self.module, (nn.RNNBase, nn.modules.rnn.RNNBase)):
            self.module.flatten_parameters = self._do_nothing

    def _setweights(self):
        for i, layer in enumerate(self.layer_names):
            raw_w = getattr(self, f'{layer}_raw')

            # this is the part that is different from weight drop
            N, K = raw_w.shape
            mask = F.dropout(torch.ones(N, 1), p=self.weight_p)
            mask = mask.repeat(1, K)
            self.module._parameters[layer] = raw_w * mask

            if self.idxs is not None: self.module._flat_weights[self.idxs[i]] = self.module._parameters[layer]

    def forward(self, *args, do_dropout=True, **kwargs):
        if(do_dropout):
            self._setweights()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            return self.module.forward(*args)

    def reset(self):
        for layer in self.layer_names:
            raw_w = getattr(self, f'{layer}_raw')
            self.module._parameters[layer] = F.dropout(raw_w, p=self.weight_p, training=False)
        if hasattr(self.module, 'reset'): self.module.reset()

    def _do_nothing(self): pass

and when I run the following code

module = nn.LSTM(1,20)
m = VariationalLSTM(module , weight_p=0.25)
print(m)
optimizer = torch.optim.Adam(m.parameters() , lr = 1e-3)

the error is the following:

VariationalLSTM(
  (module): LSTM(1, 20)
)

---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-10-07433498e4d2> in <module>
      2 m = VariationalLSTM(module , weight_p=0.25)
      3 print(m)
----> 4 optimizer = torch.optim.Adam(m.parameters() , lr = 1e-3)

/usr/local/lib/python3.6/dist-packages/torch/optim/adam.py in __init__(self, params, lr, betas, eps, weight_decay, amsgrad)
     40         defaults = dict(lr=lr, betas=betas, eps=eps,
     41                         weight_decay=weight_decay, amsgrad=amsgrad)
---> 42         super(Adam, self).__init__(params, defaults)
     43 
     44     def __setstate__(self, state):

/usr/local/lib/python3.6/dist-packages/torch/optim/optimizer.py in __init__(self, params, defaults)
     49 
     50         for param_group in param_groups:
---> 51             self.add_param_group(param_group)
     52 
     53     def __getstate__(self):

/usr/local/lib/python3.6/dist-packages/torch/optim/optimizer.py in add_param_group(self, param_group)
    200                                 "but one of the params is " + torch.typename(param))
    201             if not param.is_leaf:
--> 202                 raise ValueError("can't optimize a non-leaf Tensor")
    203 
    204         for name, default in self.defaults.items():


ValueError: can't optimize a non-leaf Tensor

Here is the full code along with the imports:

import warnings

import torch
import torch.nn as nn
import torch.nn.functional as F

class VariationalLSTM(nn.Module):
    
    
    
    """
    Modified by Code taken at  https://github.com/fastai/fastai/blob/master/fastai/text/models/awd_lstm.py#L27.
    FastAI implementation implements WeightDropLSTM which is different from VariationaLSTM.
    Former drops certain elements in matrix that defines recurrent connection in each layer
    Variational Dropout(VDropout) is different in following.Hence those changes are made
    
    
    i)VDropout Should zero out rows of matrices, instead of arbitrarily zeroing it out 
    ii)VDropout also zeros matrices that involves Output connection 
    
    """    


    def __init__(self,module,weight_p=0.0, layer_names=['weight_hh_l0']):
        
        super().__init__()
        
        self.module  = module
        self.weight_p,self.layer_names = weight_p,layer_names
        self.idxs = [] if hasattr(self.module, '_flat_weights_names') else None
        
        
        
        for layer in self.layer_names:
        
        
            w = getattr(self.module, layer)
            self.register_parameter(f'{layer}_raw', nn.Parameter(w.data))
              """
              following is the modification I made
             """   
            
            N,K  = w.shape
            mask = F.dropout(torch.ones(N,1),p=self.weight_p,training=False)
            mask = mask.repeat(1,K)
            self.module._parameters[layer] = w * mask
            
            if self.idxs is not None: self.idxs.append(self.module._flat_weights_names.index(layer))
        if isinstance(self.module, (nn.RNNBase, nn.modules.rnn.RNNBase)):
            self.module.flatten_parameters = self._do_nothing

    def _setweights(self):
        for i,layer in enumerate(self.layer_names):
            raw_w = getattr(self, f'{layer}_raw')


           """
              following is the modification I made
          """   
            N,K  = raw_w.shape
            mask = F.dropout(torch.ones(N,1),p=self.weight_p)
            mask = mask.repeat(1,K)
            self.module._parameters[layer] = raw_w * mask
            
            if self.idxs is not None: self.module._flat_weights[self.idxs[i]] = self.module._parameters[layer]

    def forward(self, *args, do_dropout=True, **kwargs):
        
        if(do_dropout):
            self._setweights()
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            return self.module.forward(*args)
        
    def reset(self):
        for layer in self.layer_names:
            raw_w = getattr(self, f'{layer}_raw')
            self.module._parameters[layer] = F.dropout(raw_w, p=self.weight_p, training=False)
        if hasattr(self.module, 'reset'): self.module.reset()    
    
    def _do_nothing(self): pass

module = nn.LSTM(1,20)
m = VariationalLSTM(module , weight_p=0.25)
print(m)
optimizer = torch.optim.Adam(m.parameters() , lr = 1e-3)

I think the error is caused by the modifications I made to the code. Please take a look at the code I have posted above.
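
To narrow it down, here is a minimal sketch of what seems to be going on (my own reduction for illustration; the workaround at the end is only a sketch and has not been verified in a full training loop). Multiplying the weight by a mask produces a new tensor with a grad_fn, so it is no longer a leaf, whereas the fastai-style version with F.dropout(..., training=False) does not trigger the error, which suggests the entry stored in module._parameters there is still the original leaf parameter.

import torch
import torch.nn as nn
import torch.nn.functional as F

lstm = nn.LSTM(1, 20)
w = lstm.weight_hh_l0

# The registered parameter itself is a leaf tensor.
print(w.is_leaf)  # True

# The product of the parameter and a mask is the result of an op, hence non-leaf.
mask = F.dropout(torch.ones(w.shape[0], 1), p=0.25, training=False).repeat(1, w.shape[1])
print((w * mask).is_leaf)  # False

# Hypothetical workaround sketch (assumes the VariationalLSTM instance `m` from above):
# hand Adam only the leaf tensors, e.g. the *_raw copies; the masked copies in
# module._parameters are rebuilt by _setweights at every forward pass anyway.
# leaf_params = [p for p in m.parameters() if p.is_leaf]
# optimizer = torch.optim.Adam(leaf_params, lr=1e-3)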