Thanks for replying. This is running on GPU.
There are four parts to the model (frontend, classification, regression, and regularizers), each with corresponding optimizers. The error should be coming from the scheduler, because the nan value shows up when the learning rate is decreased during an epoch.
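To pin that down I'm adding a small guard at the logging step. `check_finite` below is just a hypothetical debugging helper I sketched, not part of the model:

```python
import math

import torch

def check_finite(tag, value):
    # Hypothetical debugging helper: fail loudly at the first
    # non-finite number, so the nan can be attributed to either
    # a scheduler output or one of the losses.
    v = value.item() if torch.is_tensor(value) else value
    if not math.isfinite(v):
        raise RuntimeError("non-finite value in {}: {!r}".format(tag, v))

# e.g., right after the scheduler calls in the loop below:
#   check_finite("frontend lr", lrs["frontend"])
#   check_finite("total loss", losses["total"])
```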
Training code:

```python
# imports shown for context; this block lives inside a trainer method
import torch
from torch.cuda.amp import autocast
from tqdm import trange

scaler = torch.cuda.amp.GradScaler()
for e in range(self.epoch_beg, self.epoch):
    self.model.train()
    iterator = iter(dataloader)
    with trange(1, self.bpe + 1) as pbar:
        for bidx in pbar:
            pbar.set_description("Epoch {}/{}".format(e, self.epoch))
            # restart the iterator when the dataloader is exhausted
            try:
                batch = next(iterator)
            except StopIteration:
                iterator = iter(dataloader)
                batch = next(iterator)
            for k in batch.keys():
                batch[k] = batch[k].cuda()
            self.frontend_optim.zero_grad()
            for worker in self.cl:
                self.cls_optim[worker.name].zero_grad()
            for worker in self.re:
                self.regr_optim[worker.name].zero_grad()
            tot_loss = 0
            losses = {}
            with autocast():
                h, chunk, preds, labels = self.model.forward(batch, self.alphaSG, device)
                label = labels
                for worker in self.cl:
                    loss = worker.loss_weight * worker.loss(preds[worker.name], label[worker.name])
                    losses[worker.name] = loss
                    tot_loss += loss
                for worker in self.re:
                    loss = worker.loss_weight * worker.loss(preds[worker.name], label[worker.name])
                    losses[worker.name] = loss
                    tot_loss += loss
                for worker in self.reg:
                    loss = worker.loss_weight * worker.loss(preds[worker.name], label[worker.name])
                    losses[worker.name] = loss
                    tot_loss += loss
            # one backward pass on the scaled sum, then step every optimizer
            scaler.scale(tot_loss).backward()
            for _, optim in self.cls_optim.items():
                scaler.step(optim)
            for _, optim in self.regr_optim.items():
                scaler.step(optim)
            scaler.step(self.frontend_optim)
            losses["total"] = tot_loss
            self.alphaSG = 1
            # a single update() per iteration, after all optimizers stepped
            scaler.update()
            if bidx % self.log_freq == 0 or bidx >= self.bpe:
                # decrease learning rate
                lrs = {}
                lrs["frontend"] = self.fe_scheduler(self.frontend_optim, bidx, e, losses["total"].item())
                for name, scheduler in self.cls_scheduler.items():
                    lrs[name] = scheduler(self.cls_optim[name], bidx, e, losses[name].item())
                for name, scheduler in self.regr_scheduler.items():
                    lrs[name] = scheduler(self.regr_optim[name], bidx, e, losses[name].item())
                for k in losses.keys():
                    if k not in lrs:
                        lrs[k] = 0
```
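If the nan turns out to come from a loss rather than the scheduler, torch's anomaly detection can point at the op that first produces it. A sketch (summing only the classification heads for brevity; anomaly detection is slow, so I'd wrap just a suspect batch):

```python
# Sketch: re-run one suspect batch under anomaly detection to get a
# stack trace for the first backward op that yields nan/inf.
with torch.autograd.detect_anomaly():
    with autocast():
        h, chunk, preds, labels = self.model.forward(batch, self.alphaSG, device)
        tot_loss = sum(
            worker.loss_weight * worker.loss(preds[worker.name], labels[worker.name])
            for worker in self.cl
        )
    scaler.scale(tot_loss).backward()
```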
EDIT:
Scheduler code:

```python
import math


class LR_Scheduler(object):
    """Learning-rate scheduler.

    Step mode:   ``lr = baselr * 0.1 ^ {floor((epoch - 1) / lr_step)}``
    Cosine mode: ``lr = baselr * 0.5 * (1 + cos(iter / maxiter))``
    Poly mode:   ``lr = baselr * (1 - iter / maxiter) ^ 0.9``

    Args:
        mode: lr scheduler mode (``cos``, ``poly``, ``step``)
        optim_name: name used for logging
        base_lr: base learning rate
        num_epochs: number of epochs
        iters_per_epoch: number of iterations per epoch
        lr_step: epochs between lr drops in ``step`` mode
        warmup_epochs: epochs of linear warm-up
    """

    def __init__(self, mode, optim_name, base_lr, num_epochs, iters_per_epoch=0,
                 lr_step=30, warmup_epochs=0):
        self.mode = mode
        self.name = optim_name
        print('Using {} LR Scheduler for {}!'.format(self.mode, optim_name))
        self.lr = base_lr
        if mode == 'step':
            assert lr_step
        self.lr_step = lr_step
        self.iters_per_epoch = iters_per_epoch
        self.N = num_epochs * iters_per_epoch
        self.epoch = -1
        self.warmup_iters = warmup_epochs * iters_per_epoch

    def __call__(self, optimizer, i, epoch, loss):
        # poly decay over the global iteration index T
        T = epoch * self.iters_per_epoch + i
        lr = self.lr * pow((1 - 1.0 * T / self.N), 0.9)
        # linear warm-up over the first warmup_iters iterations
        if self.warmup_iters > 0 and T < self.warmup_iters:
            lr = lr * 1.0 * T / self.warmup_iters
        self.epoch = epoch
        assert lr >= 0
        self._adjust_learning_rate(optimizer, lr)
        return lr

    def _adjust_learning_rate(self, optimizer, lr):
        if len(optimizer.param_groups) == 1:
            optimizer.param_groups[0]['lr'] = lr
        else:
            # enlarge the lr at the head
            optimizer.param_groups[0]['lr'] = lr
            for i in range(1, len(optimizer.param_groups)):
                optimizer.param_groups[i]['lr'] = lr * 10
```
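For reference, a minimal sketch of how one of these schedulers is wired up and stepped (the numeric values are illustrative, not my real config). One thing I noticed while writing it out: `N` is fixed at construction as `num_epochs * iters_per_epoch`, while `__call__` computes `T = epoch * iters_per_epoch + i`, so if `iters_per_epoch` doesn't match the `bpe` used in the training loop, `T` can drift past `N` and `1 - T/N` goes negative.

```python
import torch

# dummy parameter/optimizer so the sketch is self-contained
params = [torch.nn.Parameter(torch.zeros(1))]
frontend_optim = torch.optim.Adam(params, lr=1e-3)

fe_scheduler = LR_Scheduler(
    mode='poly',
    optim_name='frontend',
    base_lr=1e-3,
    num_epochs=100,
    iters_per_epoch=500,   # should equal self.bpe, or T drifts past N
    warmup_epochs=2,
)

# __call__(optimizer, i, epoch, loss) adjusts the lr in place and returns it
lr = fe_scheduler(frontend_optim, 10, 0, 0.0)
print(lr)
```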