Hi everyone!
I’m new to gradient accumulation, so should I call zero_grad() at the start of each epoch?
#optimizer.zero_grad() <----------------- should I call this at the beginning of every epoch?
for step in range(total_steps):
    indices = self.dataloader_dict[phase].dataset.get_train_indices()
    new_sampler = data.sampler.SubsetRandomSampler(indices=indices)
    self.dataloader_dict[phase].batch_sampler.sampler = new_sampler
    try:
        images, captions = next(iter(self.dataloader_dict[phase]))
    except:
        batches_skiped += 1
        continue
    images = images.to(self.device)
    captions = captions.to(self.device)

    with torch.set_grad_enabled(phase == 'train'):
        features = encoder(images)
        features = features.to(self.device)
        outputs = decoder(features, captions)
        loss = self.criterion(outputs.view(-1, vocab_size), captions.view(-1))

        if phase == 'train':
            loss.backward()
            torch.nn.utils.clip_grad_norm_(decoder.parameters(), 1.0)
            if (step + 1) % self.grad_acumulation_step == 0:
                self.optimizer.step()
                self.optimizer.zero_grad()
            # writing weights and grads to TensorBoard histograms
            for name, weight in decoder.named_parameters():
                self.tb.add_histogram(name, weight, step)
                #self.tb.add_histogram(f'{name}.grad', weight.grad, step)
        elif step % (total_steps // 5) == 0:
            examples = self.add_examples(captions, outputs, phase)
            self.tb.add_text(f'{phase}:ground_truth/predictions', examples, step)

    running_loss += loss.item() * features.size(0)
    bleu4 = self.compute_metric(captions, outputs)
    running_bleu += bleu4
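For comparison, this is the generic accumulation pattern I've seen in tutorials, as a minimal runnable sketch (the model, optimizer, dataloader and accum_steps here are just placeholders, not my real encoder/decoder code), mainly to show where I think zero_grad() is supposed to go:

import torch
import torch.nn as nn

# placeholder model/data just so the sketch runs on its own
model = nn.Linear(10, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()
accum_steps = 4
dataloader = [(torch.randn(8, 10), torch.randint(0, 2, (8,))) for _ in range(12)]

for epoch in range(2):
    optimizer.zero_grad()                              # clean grads once at the start of the epoch
    for step, (x, y) in enumerate(dataloader):
        loss = criterion(model(x), y) / accum_steps    # scale so the update matches one big batch
        loss.backward()                                # grads keep accumulating between updates
        if (step + 1) % accum_steps == 0:
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()                           # one update every accum_steps mini-batches
            optimizer.zero_grad()                      # clear grads only after the update

If I understand it correctly, the zero_grad() at the top of each epoch mainly guards against a leftover half-accumulated gradient when total_steps isn’t a multiple of grad_acumulation_step, and I think I should also be dividing the loss by the number of accumulation steps, which my code above doesn’t do. Is that right?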