Hi, I’d like to ask whether anyone can help with using torch.cuda.amp for float16 training. When I train with autocast only around the forward pass and loss computation (config.use_fp16_bp=False, so the backward pass and optimizer step stay in full precision), the loss becomes NaN, and this happens every time I try.
Environment:
PyTorch 1.6, CUDA 10.1
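To make the symptom concrete, this is the kind of check where the NaN shows up (toy model and data here, just to illustrate where I look; my real training loop is in the snippet below):

import torch
import torch.nn as nn

# Toy stand-ins for my real network and batch, only to illustrate the check.
model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 1)).cuda()
imgs = torch.randn(8, 64, device="cuda")
gts = torch.randn(8, 1, device="cuda")

# Forward pass and loss under autocast, the same pattern as in my training loop.
with torch.cuda.amp.autocast():
    out = model(imgs)
    loss = nn.functional.mse_loss(out, gts)

# This is the check that fails for me: with my real model the loss is NaN.
if not torch.isfinite(loss).all():
    print("non-finite loss detected")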
Snippet:
with torch.cuda.amp.autocast():
    loss = model(imgs, gts)

# reduce the whole loss over multi-gpu
if engine.distributed:
    dist.all_reduce(loss, dist.ReduceOp.SUM)
    loss = loss / engine.world_size
else:
    if len(loss.shape) > 1:
        loss = Reduce.apply(*loss) / len(loss)

optimizer.zero_grad()
current_idx = epoch * config.niters_per_epoch + idx
lr = lr_policy.get_lr(current_idx)
for i in range(0, len(optimizer.param_groups)):
    optimizer.param_groups[i]['lr'] = lr

if config.use_fp16_bp:
    # Scales the loss, and calls backward()
    # to create scaled gradients
    scaler.scale(loss).backward()
    # Unscales gradients and calls
    # or skips optimizer.step()
    scaler.step(optimizer)
    # Updates the scale for next iteration
    scaler.update()
else:
    loss.backward()
    optimizer.step()
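For reference, the use_fp16_bp=True branch is modeled on the plain autocast + GradScaler recipe from the torch.cuda.amp documentation. A self-contained sketch of that recipe (toy model and SGD here; in my real code the scaler is likewise created once before the loop):

import torch
import torch.nn as nn

model = nn.Sequential(nn.Linear(64, 64), nn.ReLU(), nn.Linear(64, 1)).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
scaler = torch.cuda.amp.GradScaler()  # created once, outside the training loop

for step in range(10):
    imgs = torch.randn(8, 64, device="cuda")
    gts = torch.randn(8, 1, device="cuda")

    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        out = model(imgs)
        loss = nn.functional.mse_loss(out, gts)

    scaler.scale(loss).backward()  # backward on the scaled loss
    scaler.step(optimizer)         # unscales grads; skips the step if they contain inf/NaN
    scaler.update()                # adjusts the scale factor for the next iteration

In my case, though, the NaN already appears in the forward pass and loss computation with use_fp16_bp=False, i.e. before any scaled backward runs, so I suspect something in the forward pass under autocast. Any ideas what could cause this?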