Hello,
I am trying to do gradient accumulation
model.zero_grad() # Reset gradients tensors
for i, (inputs, labels) in enumerate(training_set):
predictions = model(inputs) # Forward pass
loss = loss_function(predictions, labels) # Compute loss function
loss = loss / accumulation_steps # Normalize our loss (if averaged)
loss.backward() # Backward pass
if (i+1) % accumulation_steps == 0: # Wait for several backward steps
optimizer.step() # Now we can do an optimizer step
model.zero_grad() # Reset gradients tensors
if (i+1) % evaluation_steps == 0: # Evaluate the model when we...
evaluate_model()
as mentioned here
But my model is on multiple GPUs
So when my loss is output from the model how do I scale it? also I want to clip grad norms while training it. How should my training loop modify?
My code is this, does this looks correct?
for i, batch in enumerate(train_loader_with_distributed_sampler):
for param_group in optimizer.param_groups:
param_group['lr'] = learning_rate
x, y = batch
y_hat = model(x)
loss = criterion(y_hat, y).mean()
loss = loss / hparams.gradient_accumulation_step
reduced_loss = reduce_tensor(loss.data, n_gpus).item()
loss.backward()
current_accumulation_run = iteration % hparams.gradient_accumulation_step + 1
# my grad clip thres is 1.0 so it will be multiplied with 1, 2, 3, 4, 5 based on my gradient accumulation step size
# Or maybe I don't need to manage this?
grad_norm = torch.nn.utils.clip_grad_norm_(
model.parameters(), hparams.grad_clip_thresh * current_accumulation_run)
grad_norm = grad_norm
if (i + 1) % hparams.gradient_accumulation_step == 0:
optimizer.step()
model.zero_grad()
if rank == 0:
print("Optimizing Step")
print("Train loss {} {:.6f} Grad Norm {:.6f}".format(
i, reduced_loss, grad_norm))