I have my model as described below. While I was trying to check the gradient flow using this PyTorch post (Check gradient flow in network), I discovered that the gradients of some of my parameters are still None. After a little debugging, I found that the following layers have None gradients: bert.pooler.dense.weight and bert.pooler.dense.bias.
I am not sure whether it is even relevant for the pooler layers to receive gradients in this setup.
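This is roughly the check I ran after a backward pass on one training batch (a sketch; `model` is my EntityModel instance and the backward pass comes from my training step):

for name, param in model.named_parameters():
    if param.requires_grad and param.grad is None:
        print(f"{name} has grad None")
# After loss.backward(), this prints bert.pooler.dense.weight and bert.pooler.dense.bias.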
This is my model:
# Imports needed for this snippet; `config` and `GaussianNoise` are defined elsewhere in my code.
import torch.nn as nn
from transformers import XLMRobertaModel

class EntityModel(nn.Module):
    def __init__(self, std_gaussian=0.1, with_noise_layer=True, dropout_layer=False, dropout_prob=0.3):
        super(EntityModel, self).__init__()
        self.std_gaussian = std_gaussian
        self.with_noise_layer = with_noise_layer
        self.bert = XLMRobertaModel.from_pretrained(config.BASE_MODEL,
                                                    output_attentions=False,
                                                    output_hidden_states=False)
        self.dropout_layer = dropout_layer
        self.dropout_prob = dropout_prob
        if self.with_noise_layer:
            self.noise = GaussianNoise(stddev=self.std_gaussian)  # custom additive Gaussian-noise layer
        if self.dropout_layer:
            self.bert_drop_1 = nn.Dropout(self.dropout_prob)
        self.out_tag = nn.Linear(768, 2)
        self.softmax = nn.Softmax(dim=2)

    def forward(self, ids, attention_mask):
        # With return_dict=False, outputs[0] is the token-level sequence output;
        # outputs[1] (the pooler output) is never used below.
        outputs = self.bert(ids,
                            attention_mask=attention_mask,
                            return_dict=False)
        if self.with_noise_layer:
            noise = self.noise(outputs[0])  # (batch, 256, 768)
            if self.dropout_layer:
                bo_tag = self.bert_drop_1(noise)
                tag = self.out_tag(bo_tag)
            else:
                tag = self.out_tag(noise)
        else:
            if self.dropout_layer:
                bo_tag = self.bert_drop_1(outputs[0])
                tag = self.out_tag(bo_tag)
            else:
                tag = self.out_tag(outputs[0])
        softmax_prob = self.softmax(tag)
        return softmax_prob, tag
Edit: I checked this with a plain XLMRoberta model with a single linear layer on top, but it still gives None gradients for the pooler weights and biases.
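For reference, the minimal version I tried looks roughly like this (a sketch with dummy inputs; "xlm-roberta-base" stands in for config.BASE_MODEL):

import torch
import torch.nn as nn
from transformers import XLMRobertaModel

bert = XLMRobertaModel.from_pretrained("xlm-roberta-base")
head = nn.Linear(768, 2)

ids = torch.randint(0, 1000, (1, 8))   # dummy token ids
mask = torch.ones_like(ids)            # dummy attention mask

sequence_output = bert(ids, attention_mask=mask, return_dict=False)[0]
loss = head(sequence_output).sum()     # dummy scalar loss
loss.backward()

# The pooler parameters still come back with grad None:
for name, param in bert.named_parameters():
    if param.grad is None:
        print(name)  # pooler.dense.weight, pooler.dense.bias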