I’m trying to plot the gradient flow of my model. It was working fine before but somehow it’s not working right now. Here are my codes:
Model Class:
class Model(nn.Module):
def __init__(self, args, vocab):
super(Model, self).__init__()
self.args = args
self.vocab = vocab
# input layer
input_size = 0
self.word_emb = nn.Embedding(len(vocab['words']) + 1, self.args['word_emb_dim'])
input_size += self.args['word_emb_dim']
self.pos_emb = nn.Embedding(len(vocab['postags']) + 1, self.args['pos_emb_dim'])
input_size += self.args['pos_emb_dim']
# self.dep_emb = nn.Embedding(len(vocab['deprels']), self.args['dep_emb_dim'])
# recurrent layer
self.GRU = nn.GRU(input_size, self.args['hidden_dim'], self.args['num_layers'],\
batch_first=True, dropout=self.args['dropout'], bidirectional=True)
self.GRU_hidden = nn.Parameter(torch.zeros(self.args['num_layers'] * 2, self.args['batch_size'], self.args['hidden_dim']))
# classifier
self.unlabeled = DeepBiaffine(self.args['hidden_dim'] * 2, self.args['hidden_dim'] * 2, self.args['dba_hidden_dim'], 1, dropout=self.args['dropout'])
self.deprel = DeepBiaffine(self.args['hidden_dim'] * 2, self.args['hidden_dim'] * 2, self.args['dba_hidden_dim'], len(vocab['deprels']), dropout=self.args['dropout'])
# criterion
self.crit = nn.CrossEntropyLoss(ignore_index=-1)
self.dropout = nn.Dropout(self.args['dropout'])
def forward(self, words, word_mask, postags, heads, deprels, sentlens):
words = words.cuda()
word_mask = word_mask.cuda()
postags = postags.cuda()
heads = heads.cuda()
deprels = deprels.cuda()
inputs = []
# pack embedded inputs
embedded_word = self.word_emb(words)
inputs += [pack_padded_sequence(embedded_word, sentlens, batch_first=True, enforce_sorted=False)]
embedded_pos = self.pos_emb(postags)
inputs += [pack_padded_sequence(embedded_pos, sentlens, batch_first=True, enforce_sorted=False)]
# rnn inputs
rnn_inputs = torch.cat([x.data for x in inputs], 1)
rnn_inputs = self.dropout(rnn_inputs)
rnn_inputs = PackedSequence(rnn_inputs, inputs[0].batch_sizes)
rnn_outputs, h = self.GRU(rnn_inputs, self.GRU_hidden)
self.GRU_hidden = nn.Parameter(h, requires_grad=True)
rnn_outputs, _ = pad_packed_sequence(rnn_outputs, batch_first=True)
unlabeled_inputs = rnn_outputs.unsqueeze(len(rnn_outputs.size())-1).expand(rnn_outputs.size(0), rnn_outputs.size(1), rnn_outputs.size(1), rnn_outputs.size(2))
unlabeled_scores = self.unlabeled(self.dropout(unlabeled_inputs), self.dropout(unlabeled_inputs))
deprel_scores = self.deprel(self.dropout(rnn_outputs), self.dropout(rnn_outputs))
unlabeled_scores = unlabeled_scores[:, 1:, :]
unlabeled_mask = word_mask.unsqueeze(1).expand(unlabeled_scores.size(0), unlabeled_scores.size(1), unlabeled_scores.size(2))
unlabeled_scores = unlabeled_scores.contiguous().view(-1, unlabeled_scores.size(2))
unlabeled_mask = unlabeled_mask.reshape(unlabeled_scores.size(0), unlabeled_scores.size(1))
unlabeled_scores = unlabeled_scores.masked_fill_(unlabeled_mask, -float('inf'))
deprel_scores = deprel_scores[:, 1:, :]
deprel_scores = deprel_scores.contiguous().view(-1, len(self.vocab['deprels']))
# heads -= 1
preds = []
if self.training:
unlabeled_target = heads.masked_fill_(word_mask[:, 1:], -1)
deprel_target = deprels.masked_fill_(word_mask[:, 1:], -1)
loss = self.crit(unlabeled_scores, unlabeled_target.view(-1))
loss += self.crit(deprel_scores, deprel_target.view(-1))
# calculate accuracy
unlabeled_preds = F.log_softmax(unlabeled_scores, 1)
unlabeled_preds = unlabeled_preds.argmax(dim=1)
unlabeled_preds = unlabeled_preds.masked_fill_(word_mask[:, 1:].reshape(unlabeled_preds.size(0)), -1)
ul_corr = (unlabeled_preds == heads.view(-1))
acc1 = ul_corr.sum().float() / float( heads.view(-1).size(0) )
deprel_preds = F.log_softmax(deprel_scores, 1)
deprel_preds = deprel_scores.argmax(dim=1)
deprel_preds = deprel_preds.masked_fill_(word_mask[:, 1:].reshape(unlabeled_preds.size(0)), -1)
dep_corr = (deprel_preds == deprels.view(-1))
acc2 = dep_corr.sum().float() / float( deprels.view(-1).size(0) )
acc = (acc1 + acc2)/2
else:
unlabeled_target = heads.masked_fill_(word_mask[:, 1:], -1)
deprel_target = deprels.masked_fill_(word_mask[:, 1:], -1)
loss = self.crit(unlabeled_scores, unlabeled_target.view(-1))
loss += self.crit(deprel_scores, deprel_target.view(-1))
# calculate accuracy
unlabeled_preds = F.log_softmax(unlabeled_scores, 1)
unlabeled_preds = unlabeled_preds.argmax(dim=1)
unlabeled_preds = unlabeled_preds.masked_fill_(word_mask[:, 1:].reshape(unlabeled_preds.size(0)), -1)
ul_corr = (unlabeled_preds == heads.view(-1))
acc1 = ul_corr.sum().float() / float( heads.view(-1).size(0) )
deprel_preds = F.log_softmax(deprel_scores, 1)
deprel_preds = deprel_scores.argmax(dim=1)
deprel_preds = deprel_preds.masked_fill_(word_mask[:, 1:].reshape(unlabeled_preds.size(0)), -1)
dep_corr = (deprel_preds == deprels.view(-1))
acc2 = dep_corr.sum().float() / float( deprels.view(-1).size(0) )
acc = (acc1 + acc2)/2
# predictions
unlabeled_preds = F.log_softmax(unlabeled_scores, 1)
unlabeled_preds = unlabeled_preds.argmax(dim=1)
deprel_preds = F.log_softmax(deprel_scores, 1)
deprel_preds = deprel_scores.argmax(dim=1)
preds.append(unlabeled_preds.detach().cpu().numpy())
preds.append(deprel_preds.detach().cpu().numpy())
return loss, acc, preds
model = Model(args, train_data.vocabs)
model.cuda()
Output1
Model(
(word_emb): Embedding(19224, 100)
(pos_emb): Embedding(17, 100)
(GRU): GRU(200, 128, num_layers=64, batch_first=True, dropout=0.33, bidirectional=True)
(unlabeled): DeepBiaffine(
(MLP1): Linear(in_features=256, out_features=128, bias=True)
(MLP2): Linear(in_features=256, out_features=128, bias=True)
(biaff): Bilinear(in1_features=129, in2_features=129, out_features=1, bias=True)
(drop): Dropout(p=0.33, inplace=False)
)
(deprel): DeepBiaffine(
(MLP1): Linear(in_features=256, out_features=128, bias=True)
(MLP2): Linear(in_features=256, out_features=128, bias=True)
(biaff): Bilinear(in1_features=129, in2_features=129, out_features=31, bias=True)
(drop): Dropout(p=0.33, inplace=False)
)
(crit): CrossEntropyLoss()
(dropout): Dropout(p=0.33, inplace=False)
)
Training function helper
import time
from torch.autograd import Variable
def train_model(args, model, train_data, eval_data=None, num_epochs=1):
optimizer = optim.Adam(model.parameters(), lr=args['learning_rate'])
print('start training...')
start_time = time.time()
for epoch in range(num_epochs):
start_time = time.time()
# train
train_loss = train_acc = 0
with torch.enable_grad():
model.train()
for i, batch in enumerate(train_data):
# model.zero_grad()
words, word_mask, postags, heads, deprels, sentlens = batch
loss, acc, _ = model(words, word_mask, postags, heads, deprels, sentlens)
train_loss += loss.item()
train_acc += acc.item()
loss = Variable(loss, requires_grad=True)
loss.backward()
plot_grad_flow(model.named_parameters())
torch.nn.utils.clip_grad_norm_(model.parameters(), args['clip'])
optimizer.step()
torch.cuda.empty_cache()
elapsed_time = time.time() - start_time
train_loss /= len(train_data)
train_acc /= len(train_data)
# eval
model.eval()
eval_loss = eval_acc = 0
if eval_data is not None:
with torch.no_grad():
for i, batch in enumerate(eval_data):
words, word_mask, postags, heads, deprels, sentlens = batch
loss, acc, _ = model(words, word_mask, postags, heads, deprels, sentlens)
eval_loss += loss.data.cpu().numpy()
eval_acc += acc.data.cpu().numpy()
torch.cuda.empty_cache()
eval_loss /= len(eval_data)
eval_acc /= len(eval_data)
else:
eval_loss = 0
eval_acc = 0
log = '| {}/{} epoch | train_loss:{:.5f} | train_acc:{:2.2f} | eval_loss:{:.5f} | eval_acc:{:2.2f} | time: {:.2f} |'.format(
epoch + 1, num_epochs, train_loss, train_acc * 100, eval_loss, eval_acc * 100, elapsed_time
)
train_data.reshuffle()
eval_data.reshuffle()
print(log)
return
I tried to plot it with this function:
import matplotlib.pyplot as plt
%matplotlib inline
def plot_grad_flow(named_parameters):
ave_grads = []
layers = []
for n, p in named_parameters:
if(p.requires_grad) and ("bias" not in n):
print(n, p.grad)
layers.append(n)
ave_grads.append(p.grad.abs().mean())
plt.plot(ave_grads, alpha=0.3, color="b")
plt.hlines(0, 0, len(ave_grads)+1, linewidth=1, color="k" )
plt.xticks(range(0,len(ave_grads), 1), layers, rotation="vertical")
plt.xlim(xmin=0, xmax=len(ave_grads))
plt.xlabel("Layers")
plt.ylabel("average gradient")
plt.title("Gradient flow")
plt.grid(True)
plt.show()
Here is the output:
start training...
GRU_hidden None
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-16-8432d94514b3> in <module>()
----> 1 train_model(args, model, train_data, num_epochs=3)
1 frames
<ipython-input-15-ba3b575b824f> in plot_grad_flow(named_parameters)
8 print(n, p.grad)
9 layers.append(n)
---> 10 ave_grads.append(p.grad.abs().mean())
11 plt.plot(ave_grads, alpha=0.3, color="b")
12 plt.hlines(0, 0, len(ave_grads)+1, linewidth=1, color="k" )
AttributeError: 'NoneType' object has no attribute 'abs'
It’s weird because I called enable_grad() beforehand. Please help me point out where did I do wrong. Thank you!