I'm trying to do gradient accumulation (from this) for an RNN model, so I also need to clip gradients. Could you please clarify whether I'm doing these two operations correctly in the code below (right order and logic)?
import torch
from torch import autocast, nn
from torch.cuda.amp import GradScaler
from tqdm import tqdm

# model, optimizer, device and train_generator are defined earlier
loss_function = nn.BCEWithLogitsLoss()
num_batches = 1
running_loss = 0.0
iters_to_accumulate = 4
scaler = GradScaler()

model.train()
for batch in tqdm(train_generator, desc='Training'):
    with autocast(device_type=device.type, dtype=torch.float16):
        output = torch.flatten(model(batch['features'], batch['category'])).to(device)
        batch_loss = loss_function(output, batch['label'].float())
        # scale the loss down so the accumulated gradients average over the group
        batch_loss = batch_loss / iters_to_accumulate

    scaler.scale(batch_loss).backward()

    # every iters_to_accumulate batches: unscale, clip, step, update, reset grads
    if num_batches % iters_to_accumulate == 0:
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

    num_batches += 1
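For context, this is the plain AMP-plus-clipping pattern (without accumulation) that I'm trying to combine with accumulation. As I understand it from the torch.cuda.amp examples, unscale_() has to run before clip_grad_norm_() so that clipping sees the real gradient magnitudes. This is just a minimal sketch reusing the same model / optimizer / train_generator / loss_function names as above:

scaler = GradScaler()
for batch in train_generator:
    optimizer.zero_grad()
    with autocast(device_type=device.type, dtype=torch.float16):
        output = torch.flatten(model(batch['features'], batch['category']))
        loss = loss_function(output, batch['label'].float())
    scaler.scale(loss).backward()
    scaler.unscale_(optimizer)                                        # unscale in place first...
    torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.)   # ...then clip the unscaled grads
    scaler.step(optimizer)   # skips the step if grads contain inf/NaN
    scaler.update()

My question is whether the same ordering still holds when the step only happens every iters_to_accumulate batches, as in my code above.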