I am trying to implement a training loop with mixed precision and gradient accumulation, but when it reaches the second backward step it raises this error:
Variable._execution_engine.run_backward( # Calls into the C++ engine to run the backward pass
RuntimeError: Trying to backward through the graph a second time (or directly access saved tensors after they have already been freed). Saved intermediate values of the graph are freed when you call .backward() or autograd.grad(). Specify retain_graph=True if you need to backward through the graph a second time or if you need to access saved tensors after calling backward.
This is the code I am using:
# Loop over the dataset multiple times
print("Start Training ...")
while steps < cfg.train.trainer.max_steps:
    dt = time.time()
    model.train()
    print('Loading Batches ...')
    for i, data in enumerate(train_dl):
        lr, hr = data[0].to(device), data[1].to(device)

        # Forward pass and loss computation run under autocast for mixed precision
        with torch.cuda.amp.autocast():
            sr, lq = model(lr)
            loss_dict = compute_loss(loss_fn, loss_dict, sr, hr, lq)
            # Divide by the accumulation count so gradients average correctly
            loss = loss_dict["Loss"] / num_grad_acc

        print("Loss:", loss_dict["Loss"].item())
        metrics_dict = compute_metric(metric, metrics_dict, sr, hr)

        print("Scaling Loss ...")
        scaler.scale(loss).backward()

        # Step the optimizer only every num_grad_acc mini-batches
        if (i + 1) % num_grad_acc == 0:
            print("Updating Parameters at Step {} ...".format(i))
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()
            optimizer.zero_grad()
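
For reference, this is the minimal mixed-precision + gradient-accumulation pattern from the PyTorch AMP examples that I modeled the loop on. The toy model, data, and hyperparameters below are just placeholders to make it self-contained; my real code uses the compute_loss/compute_metric helpers instead of a plain criterion:

import torch

device = "cuda"
model = torch.nn.Linear(16, 16).to(device)
criterion = torch.nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
scaler = torch.cuda.amp.GradScaler()
num_grad_acc = 4

# Dummy (input, target) pairs standing in for train_dl
data = [(torch.randn(8, 16), torch.randn(8, 16)) for _ in range(16)]

for i, (lr_batch, hr_batch) in enumerate(data):
    lr_batch, hr_batch = lr_batch.to(device), hr_batch.to(device)

    with torch.cuda.amp.autocast():
        sr = model(lr_batch)
        # The loss tensor is rebuilt from scratch every iteration,
        # so each backward() call consumes a fresh graph
        loss = criterion(sr, hr_batch) / num_grad_acc

    scaler.scale(loss).backward()

    if (i + 1) % num_grad_acc == 0:
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()

The one structural difference I can see from my loop is that here the loss is recreated from the current batch on every iteration, whereas mine flows through compute_loss and the shared loss_dict.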