Hi, I’m wondering how I can accumulate gradients in PyTorch. My training loop runs for a fixed number of steps, as you can see below.
My code is:
# imports used below (these live at module level in my script)
import os
import random
import sys
import time

import torch
import torch.nn as nn


def fit(self):
    cfg = self.cfg
    refiner = nn.DataParallel(self.refiner, device_ids=range(cfg.num_gpu))
    self.mean_l1 = 0.
    learning_rate = cfg.lr
    while True:
        for inputs in self.train_loader:
            self.refiner.train()
            if cfg.scale > 0:
                scale = cfg.scale
                hr, lr = inputs[-1][0], inputs[-1][1]
            else:
                # multi-scale training: pick a random scale in {2, 3, 4}
                scale = random.randint(2, 4)
                hr, lr = inputs[scale - 2][0], inputs[scale - 2][1]
            hr = hr.to(self.device)
            lr = lr.to(self.device)
            sr = refiner(lr, scale)
            loss = self.loss_fn(sr, hr)
            self.optim.zero_grad()
            loss.backward()
            nn.utils.clip_grad_norm_(self.refiner.parameters(), cfg.clip)
            self.optim.step()
            # .item() detaches the scalar; accumulating the tensor itself would
            # keep every computation graph alive and leak GPU memory
            self.mean_l1 += loss.item()
            learning_rate = self.decay_learning_rate()
            for param_group in self.optim.param_groups:
                param_group["lr"] = learning_rate
            self.step += 1
            sys.stdout.write("\r==>>Steps:[%d/%d] Total:[%.6f] "
                             % (self.step, cfg.max_steps, loss.item()))
            self.writer.add_scalar('Loss', loss.item(), global_step=self.step)
            if cfg.verbose and self.step % cfg.print_interval == 0:
                with open('logs/{}/logs.txt'.format(self.folder_name), 'a') as f:
                    PATH = os.path.join('logs/{}/checkpoints/'.format(self.folder_name),
                                        "{}_{}.pth.tar".format(cfg.ckpt_name, self.step))
                    t1 = time.time()
                    if cfg.scale > 0:
                        psnr, ssim = self.evaluate(cfg.valid_data_path, scale=cfg.scale,
                                                   num_step=self.step)
                        t2 = time.time()
                        self.writer.add_scalar("PSNR", psnr, self.step)
                        print('--meanPsnr: {:.5f} --meanSsim: {:.5f} --meanTloss: {:.5f} --> {:.2f}m'
                              .format(psnr, ssim, self.mean_l1 / cfg.print_interval, (t2 - t1) / 60))
                        is_best = psnr > self.best_psnr
                        self.best_psnr = max(psnr, self.best_psnr)
                        if is_best:
                            torch.save({'step': self.step,
                                        'model_state_dict': self.refiner.state_dict(),
                                        'optimizer_state_dict': self.optim.state_dict()}, PATH)
                        f.write('Step: {} -- PSNR x{}: {:.5f} -- Total_Loss: {:.5f}\n'
                                .format(self.step, cfg.scale, psnr, self.mean_l1 / cfg.print_interval))
                    else:
                        # evaluate every scale (x2, x3, x4); each entry is (psnr, ssim)
                        psnr = [self.evaluate(cfg.valid_data_path, scale=i, num_step=self.step)
                                for i in range(2, 5)]
                        t2 = time.time()
                        self.writer.add_scalar("PSNR_2x", psnr[0][0], self.step)
                        self.writer.add_scalar("PSNR_3x", psnr[1][0], self.step)
                        self.writer.add_scalar("PSNR_4x", psnr[2][0], self.step)
                        self.writer.add_scalar("SSIM_2x", psnr[0][1], self.step)
                        self.writer.add_scalar("SSIM_3x", psnr[1][1], self.step)
                        self.writer.add_scalar("SSIM_4x", psnr[2][1], self.step)
                        print('-mP_x2:{:.5f} -mS_x2:{:.5f} -mP_x3:{:.5f} -mS_x3:{:.5f}'
                              '-mP_x4:{:.5f} -mS_x4:{:.5f} -mL:{:.5f} -->{:.2f}m'
                              .format(psnr[0][0], psnr[0][1], psnr[1][0], psnr[1][1],
                                      psnr[2][0], psnr[2][1],
                                      self.mean_l1 / cfg.print_interval, (t2 - t1) / 60))
                        is_best = psnr[2][0] > self.best_psnr
                        self.best_psnr = max(psnr[2][0], self.best_psnr)
                        if is_best:
                            torch.save({'step': self.step,
                                        'model_state_dict': self.refiner.state_dict(),
                                        'optimizer_state_dict': self.optim.state_dict()}, PATH)
                        f.write('Step:{} -- PSNR [X4:{:.5f}, X3:{:.5f}, X2:{:.5f}] -- Total_Loss: {:.5f}\n'
                                .format(self.step, psnr[2][0], psnr[1][0], psnr[0][0],
                                        self.mean_l1 / cfg.print_interval))
                self.mean_l1 = 0.
            if self.step > cfg.max_steps:
                return  # a bare break would only exit the inner loop; while True would restart it
The inputs shape is [batch_size, channels, height, width]. With this setup I can only train with batch_size 32 on one GPU, and I want to implement gradient accumulation so I can train with a bigger effective batch size, like 64. Any idea how I can do that?
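For reference, here is the kind of pattern I have in mind, as a minimal self-contained sketch (the Linear model, the random data, and the name accum_steps are placeholders I made up, not my actual setup):

import torch
import torch.nn as nn

model = nn.Linear(10, 1)                       # toy stand-in for my refiner
optim = torch.optim.SGD(model.parameters(), lr=0.01)
loss_fn = nn.L1Loss()
accum_steps = 2                                # 2 x batch_size 32 -> effective 64

optim.zero_grad()
for i in range(8):                             # stands in for "for inputs in self.train_loader"
    x, y = torch.randn(32, 10), torch.randn(32, 1)
    # divide the loss so the summed gradients match one big batch of 64
    loss = loss_fn(model(x), y) / accum_steps
    loss.backward()                            # .grad buffers accumulate across backward() calls
    if (i + 1) % accum_steps == 0:
        optim.step()                           # one update per accum_steps mini-batches
        optim.zero_grad()                      # clear grads only after the update

My understanding is that dividing the loss by accum_steps makes the accumulated gradient behave like an average over the larger batch, and that the gradient clipping and the learning-rate update in my loop would then also move inside that if-block so they run once per effective batch, but please correct me if that's wrong.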
Thanks in advance!