Nn Parameter grad returns none although requires_grad = True

I’m trying to plot the gradient flow of my model. It was working fine before but somehow it’s not working right now. Here are my codes:

Model Class:

 class Model(nn.Module):
  def __init__(self, args, vocab):
    super(Model, self).__init__()
    self.args = args
    self.vocab = vocab
    # input layer
    input_size = 0
    self.word_emb = nn.Embedding(len(vocab['words']) + 1, self.args['word_emb_dim'])
    input_size += self.args['word_emb_dim']

    self.pos_emb = nn.Embedding(len(vocab['postags']) + 1, self.args['pos_emb_dim'])

    input_size += self.args['pos_emb_dim']

    # self.dep_emb = nn.Embedding(len(vocab['deprels']), self.args['dep_emb_dim'])

    # recurrent layer

    self.GRU = nn.GRU(input_size, self.args['hidden_dim'], self.args['num_layers'],\

                            batch_first=True, dropout=self.args['dropout'], bidirectional=True)

    self.GRU_hidden = nn.Parameter(torch.zeros(self.args['num_layers'] * 2, self.args['batch_size'], self.args['hidden_dim']))

    # classifier

    self.unlabeled = DeepBiaffine(self.args['hidden_dim'] * 2, self.args['hidden_dim'] * 2, self.args['dba_hidden_dim'], 1, dropout=self.args['dropout'])

    self.deprel = DeepBiaffine(self.args['hidden_dim'] * 2, self.args['hidden_dim'] * 2, self.args['dba_hidden_dim'], len(vocab['deprels']), dropout=self.args['dropout'])

    # criterion

    self.crit = nn.CrossEntropyLoss(ignore_index=-1)

    

    self.dropout = nn.Dropout(self.args['dropout'])

  def forward(self, words, word_mask, postags, heads, deprels, sentlens):

    words = words.cuda()

    word_mask = word_mask.cuda()

    postags = postags.cuda()

    heads = heads.cuda()

    deprels = deprels.cuda()

    inputs = []

    # pack embedded inputs

    embedded_word = self.word_emb(words)

    inputs += [pack_padded_sequence(embedded_word, sentlens, batch_first=True, enforce_sorted=False)]

    embedded_pos = self.pos_emb(postags)

    inputs += [pack_padded_sequence(embedded_pos, sentlens, batch_first=True, enforce_sorted=False)]

    # rnn inputs

    rnn_inputs = torch.cat([x.data for x in inputs], 1)

    rnn_inputs = self.dropout(rnn_inputs)

    rnn_inputs = PackedSequence(rnn_inputs, inputs[0].batch_sizes)

    rnn_outputs, h = self.GRU(rnn_inputs, self.GRU_hidden)

    self.GRU_hidden = nn.Parameter(h, requires_grad=True)

    rnn_outputs, _ = pad_packed_sequence(rnn_outputs, batch_first=True)

    unlabeled_inputs = rnn_outputs.unsqueeze(len(rnn_outputs.size())-1).expand(rnn_outputs.size(0), rnn_outputs.size(1), rnn_outputs.size(1), rnn_outputs.size(2))

    unlabeled_scores = self.unlabeled(self.dropout(unlabeled_inputs), self.dropout(unlabeled_inputs))

    deprel_scores = self.deprel(self.dropout(rnn_outputs), self.dropout(rnn_outputs))

    

    unlabeled_scores = unlabeled_scores[:, 1:, :]

    unlabeled_mask = word_mask.unsqueeze(1).expand(unlabeled_scores.size(0), unlabeled_scores.size(1), unlabeled_scores.size(2))

    unlabeled_scores = unlabeled_scores.contiguous().view(-1, unlabeled_scores.size(2))

    unlabeled_mask = unlabeled_mask.reshape(unlabeled_scores.size(0), unlabeled_scores.size(1))

    unlabeled_scores = unlabeled_scores.masked_fill_(unlabeled_mask, -float('inf'))

    deprel_scores = deprel_scores[:, 1:, :]

    deprel_scores = deprel_scores.contiguous().view(-1, len(self.vocab['deprels']))

    # heads -= 1

    preds = []

    if self.training:

      unlabeled_target = heads.masked_fill_(word_mask[:, 1:], -1)

      deprel_target = deprels.masked_fill_(word_mask[:, 1:], -1)

      loss = self.crit(unlabeled_scores, unlabeled_target.view(-1))

      loss += self.crit(deprel_scores, deprel_target.view(-1))

      

      # calculate accuracy

      unlabeled_preds = F.log_softmax(unlabeled_scores, 1)

      unlabeled_preds = unlabeled_preds.argmax(dim=1)

      unlabeled_preds = unlabeled_preds.masked_fill_(word_mask[:, 1:].reshape(unlabeled_preds.size(0)), -1)

      ul_corr = (unlabeled_preds == heads.view(-1))

      acc1 = ul_corr.sum().float() / float( heads.view(-1).size(0) )

      deprel_preds = F.log_softmax(deprel_scores, 1)

      deprel_preds = deprel_scores.argmax(dim=1)

      deprel_preds = deprel_preds.masked_fill_(word_mask[:, 1:].reshape(unlabeled_preds.size(0)), -1)

      dep_corr = (deprel_preds == deprels.view(-1))

      acc2 = dep_corr.sum().float() / float( deprels.view(-1).size(0) )

      acc = (acc1 + acc2)/2

    else:

      unlabeled_target = heads.masked_fill_(word_mask[:, 1:], -1)

      deprel_target = deprels.masked_fill_(word_mask[:, 1:], -1)

      loss = self.crit(unlabeled_scores, unlabeled_target.view(-1))

      loss += self.crit(deprel_scores, deprel_target.view(-1))

      

      # calculate accuracy

      unlabeled_preds = F.log_softmax(unlabeled_scores, 1)

      unlabeled_preds = unlabeled_preds.argmax(dim=1)

      unlabeled_preds = unlabeled_preds.masked_fill_(word_mask[:, 1:].reshape(unlabeled_preds.size(0)), -1)

      ul_corr = (unlabeled_preds == heads.view(-1))

      acc1 = ul_corr.sum().float() / float( heads.view(-1).size(0) )

      deprel_preds = F.log_softmax(deprel_scores, 1)

      deprel_preds = deprel_scores.argmax(dim=1)

      deprel_preds = deprel_preds.masked_fill_(word_mask[:, 1:].reshape(unlabeled_preds.size(0)), -1)

      dep_corr = (deprel_preds == deprels.view(-1))

      acc2 = dep_corr.sum().float() / float( deprels.view(-1).size(0) )

      acc = (acc1 + acc2)/2

      # predictions

      unlabeled_preds = F.log_softmax(unlabeled_scores, 1)

      unlabeled_preds = unlabeled_preds.argmax(dim=1)

      deprel_preds = F.log_softmax(deprel_scores, 1)

      deprel_preds = deprel_scores.argmax(dim=1)

      preds.append(unlabeled_preds.detach().cpu().numpy())

      preds.append(deprel_preds.detach().cpu().numpy())

    return loss, acc, preds
model = Model(args, train_data.vocabs)
model.cuda()

Output1

Model(
  (word_emb): Embedding(19224, 100)
  (pos_emb): Embedding(17, 100)
  (GRU): GRU(200, 128, num_layers=64, batch_first=True, dropout=0.33, bidirectional=True)
  (unlabeled): DeepBiaffine(
    (MLP1): Linear(in_features=256, out_features=128, bias=True)
    (MLP2): Linear(in_features=256, out_features=128, bias=True)
    (biaff): Bilinear(in1_features=129, in2_features=129, out_features=1, bias=True)
    (drop): Dropout(p=0.33, inplace=False)
  )
  (deprel): DeepBiaffine(
    (MLP1): Linear(in_features=256, out_features=128, bias=True)
    (MLP2): Linear(in_features=256, out_features=128, bias=True)
    (biaff): Bilinear(in1_features=129, in2_features=129, out_features=31, bias=True)
    (drop): Dropout(p=0.33, inplace=False)
  )
  (crit): CrossEntropyLoss()
  (dropout): Dropout(p=0.33, inplace=False)
)

Training function helper

import time
from torch.autograd import Variable
def train_model(args, model, train_data, eval_data=None, num_epochs=1):
  optimizer = optim.Adam(model.parameters(), lr=args['learning_rate'])
  print('start training...')
  start_time = time.time()
  for epoch in range(num_epochs):
    start_time = time.time()
    # train
    train_loss = train_acc = 0
    with torch.enable_grad():
      model.train()

      for i, batch in enumerate(train_data):
        # model.zero_grad()

        words, word_mask, postags, heads, deprels, sentlens = batch
        loss, acc, _ = model(words, word_mask, postags, heads, deprels, sentlens)
        train_loss += loss.item()
        train_acc += acc.item()
        loss = Variable(loss, requires_grad=True)
        
        loss.backward()
        plot_grad_flow(model.named_parameters())

        torch.nn.utils.clip_grad_norm_(model.parameters(), args['clip'])
        optimizer.step()
        torch.cuda.empty_cache()

    elapsed_time = time.time() - start_time
    train_loss /= len(train_data)
    train_acc /= len(train_data)

    # eval
    model.eval()
    eval_loss = eval_acc = 0
    if eval_data is not None:
      with torch.no_grad():
        for i, batch in enumerate(eval_data):
          
          words, word_mask, postags, heads, deprels, sentlens = batch
          loss, acc, _ = model(words, word_mask, postags, heads, deprels, sentlens)
          eval_loss += loss.data.cpu().numpy()
          eval_acc += acc.data.cpu().numpy()

          torch.cuda.empty_cache()

        eval_loss /= len(eval_data)
        eval_acc /= len(eval_data)
    else:
      eval_loss = 0
      eval_acc = 0
    log = '|  {}/{} epoch  |  train_loss:{:.5f} | train_acc:{:2.2f} |  eval_loss:{:.5f} | eval_acc:{:2.2f} | time: {:.2f}  |'.format(
        epoch + 1, num_epochs, train_loss, train_acc * 100, eval_loss, eval_acc * 100, elapsed_time
    )
    train_data.reshuffle()
    eval_data.reshuffle()
    print(log)
  return

I tried to plot it with this function:

import matplotlib.pyplot as plt
%matplotlib inline
def plot_grad_flow(named_parameters):
    ave_grads = []
    layers = []
    for n, p in named_parameters:
        if(p.requires_grad) and ("bias" not in n):
            print(n, p.grad)
            layers.append(n)
            ave_grads.append(p.grad.abs().mean())
    plt.plot(ave_grads, alpha=0.3, color="b")
    plt.hlines(0, 0, len(ave_grads)+1, linewidth=1, color="k" )
    plt.xticks(range(0,len(ave_grads), 1), layers, rotation="vertical")
    plt.xlim(xmin=0, xmax=len(ave_grads))
    plt.xlabel("Layers")
    plt.ylabel("average gradient")
    plt.title("Gradient flow")
    plt.grid(True)
    plt.show()

Here is the output:

start training...
GRU_hidden None
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-16-8432d94514b3> in <module>()
----> 1 train_model(args, model, train_data, num_epochs=3)

1 frames
<ipython-input-15-ba3b575b824f> in plot_grad_flow(named_parameters)
      8             print(n, p.grad)
      9             layers.append(n)
---> 10             ave_grads.append(p.grad.abs().mean())
     11     plt.plot(ave_grads, alpha=0.3, color="b")
     12     plt.hlines(0, 0, len(ave_grads)+1, linewidth=1, color="k" )

AttributeError: 'NoneType' object has no attribute 'abs'

It’s weird because I called enable_grad() beforehand. Please help me point out where did I do wrong. Thank you!

You are breaking the computation graph by recreating the loss tensor here:

loss = Variable(loss, requires_grad=True)

Remove this line of code and also remove the usage of Variable, as it’s deprecated since PyTorch 0.4 :wink: