I have been trying to fix the error "RuntimeError: cudnn RNN backward can only be called in training mode" for a couple of days, scouring the PyTorch forums as well, but have been unable to resolve it. I keep getting this error despite my model being in train mode. The model runs for 3 epochs and then crashes with this error. I am attaching the relevant code snippet and the traceback below.
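For context on what triggers this message, here is a minimal sketch (my own assumption, not taken from my codebase) that reproduces the same RuntimeError on a CUDA build with cuDNN: running an RNN forward pass while the module is in eval mode and then calling backward() on its output.

import torch
import torch.nn as nn

# Hypothetical minimal reproduction (assumes CUDA + cuDNN are available).
rnn = nn.LSTM(input_size=8, hidden_size=8).cuda()
rnn.eval()  # forward pass uses the inference-only cuDNN kernel

x = torch.randn(5, 2, 8, device="cuda", requires_grad=True)
out, _ = rnn(x)
out.sum().backward()  # RuntimeError: cudnn RNN backward can only be called in training mode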
Code Snippet
def train_epoch(self, epoch):
    self.actor.train()
    self.critic.train()
    self.train_data.shuffle()

    total_reward, report_reward = 0, 0
    total_critic_loss, report_critic_loss = 0, 0
    total_sents, report_sents = 0, 0
    total_words, report_words = 0, 0
    total_requests = 0

    for i in range(len(self.train_data)):
        batch = self.train_data[i]
        sources = batch[0]
        targets = batch[1]
        batch_size = targets.size(1)
        log_condition = (i % self.opt.log_interval == 0)

        self.actor.zero_grad()
        self.critic.zero_grad()

        # Sample translations
        attention_mask = sources[0].data.eq(lib.Constants.PAD).t()
        self.actor.decoder.attn.applyMask(attention_mask)
        samples, outputs, _, _, _, _ = self.actor.sample(batch, self.max_length, return_prob=False)

        # Calculate rewards
        rewards, samples = self.sent_reward_func(samples.t().tolist(),
                                                 targets.data.t().tolist(),
                                                 phraselevel=False, return_samples=False)
        reward = sum(rewards)
        samples = (torch.cuda.LongTensor(samples).t().contiguous()).cuda()
        rewards = (torch.cuda.FloatTensor([rewards] * samples.size(0)).contiguous()).cuda()

        # Update critic.
        assert(self.update)
        to_actor, stats = self.update_critic(sources, samples, rewards)
        num_words, critic_loss = stats
        total_requests += 1

        # Update actor
        actor_weights = self.update_actor((outputs, samples),
                                          (rewards, *to_actor), False)

        # Gather stats
        total_reward += reward
        report_reward += reward
        total_sents += batch_size
        report_sents += batch_size
        total_critic_loss += critic_loss
        report_critic_loss += critic_loss
        total_words += num_words
        report_words += num_words

        if log_condition and i > 0:
            self.log("""Epoch %3d, %6d/%d batches;
                     actor reward: %.4f; critic loss: %f; %s elapsed""" %
                     (epoch, i+1, len(self.train_data),
                      (report_reward / report_sents) * 100,
                      report_critic_loss / report_words,
                      str(datetime.timedelta(seconds=int(time.time() - self.start_time)))))
            report_reward = report_sents = report_critic_loss = report_words = 0

    self.logValidInfo(None, total_requests, total_sents, self.eval_data)

    return total_reward / total_sents, total_critic_loss / total_words, total_requests, total_sents
Traceback Error
Traceback (most recent call last):
  File "../train.py", line 255, in <module>
    main()
  File "../train.py", line 244, in main
    reinforce_trainer.train(opt.start_reinforce, opt.end_epoch, start_time)
  File "/srv/home/ahuja/BIPNMT_experiments/lib/train/ReinforceTrainer.py", line 70, in train
    self.train_epoch(epoch)
  File "/srv/home/ahuja/BIPNMT_experiments/lib/train/ReinforceTrainer.py", line 238, in train_epoch
    (rewards, *to_actor), False)
  File "/srv/home/ahuja/BIPNMT_experiments/lib/train/ReinforceTrainer.py", line 328, in update_actor
    regression=False)
  File "/srv/home/ahuja/BIPNMT_experiments/lib/model/EncoderDecoder.py", line 147, in backward
    loss = self.generator.backward(outputs, targets, weights, normalizer, criterion, regression=regression)
  File "/srv/home/ahuja/BIPNMT_experiments/lib/model/Generator.py", line 19, in backward
    loss.div(normalizer).backward()
  File "/srv/home/ahuja/anaconda3/envs/work/lib/python3.7/site-packages/torch/tensor.py", line 221, in backward
    torch.autograd.backward(self, gradient, retain_graph, create_graph)
  File "/srv/home/ahuja/anaconda3/envs/work/lib/python3.7/site-packages/torch/autograd/__init__.py", line 132, in backward
    allow_unreachable=True)  # allow_unreachable flag
RuntimeError: cudnn RNN backward can only be called in training mode
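For reference, here is a sketch of a possible workaround (my assumption, not something I have verified on this codebase): forcing the non-cuDNN RNN path by wrapping the sampling forward pass in torch.backends.cudnn.flags(enabled=False), so the backward does not go through the cuDNN RNN kernels at all. I would still like to understand why training mode is apparently being lost rather than apply this blindly.

# Untested sketch: run the forward pass with cuDNN disabled so the recorded
# graph uses the fallback RNN implementation instead of the cuDNN kernels.
with torch.backends.cudnn.flags(enabled=False):
    samples, outputs, _, _, _, _ = self.actor.sample(batch, self.max_length, return_prob=False)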
Please let me know if any further code snippets would help in analyzing the issue, and I will post them. Thanks a lot!