Performing gradient accumulation with Accelerate

Thank you so much. I have tried to correct it, but it is still the same…

import torch
import torch.nn.functional as F

# model, optimizer, scheduler, data_loader, accelerator, args, etc. are
# assumed to be set up earlier in the script (e.g. via accelerator.prepare(...))
for epoch in range(init_epoch, args.num_epoch + 1):
    for iteration, (x, y) in enumerate(data_loader):
        x_0 = x.to(device, dtype=dtype, non_blocking=True)
        y = None if not use_label else y.to(device, non_blocking=True)
        # model.zero_grad()  <- deleted here, as suggested
        if is_latent_data:
            z_0 = x_0 * args.scale_factor
        else:
            z_0 = first_stage_model.encode(x_0).latent_dist.sample().mul_(args.scale_factor)
        # sample t
        t = torch.rand((z_0.size(0),), dtype=dtype, device=device)
        t = t.view(-1, 1, 1, 1)
        z_1 = torch.randn_like(z_0)
        # 1 is real noise, 0 is real data
        z_t = (1 - t) * z_0 + (1e-5 + (1 - 1e-5) * t) * z_1
        u = (1 - 1e-5) * z_1 - z_0
        # estimate velocity
        v = model(t.squeeze(), z_t, y)
        # F.mse_loss already averages over all elements, so the extra .mean() was redundant;
        # divide by the accumulation steps so the accumulated gradient matches a full batch
        # (skip the division if the Accelerator was built with gradient_accumulation_steps,
        #  since accelerator.backward then scales the loss automatically)
        loss = F.mse_loss(v, u) / args.gradient_accumulation_steps
        accelerator.backward(loss)

        # changed below: step the optimizer only once every gradient_accumulation_steps iterations
        if (iteration + 1) % args.gradient_accumulation_steps == 0:
            optimizer.step()
            scheduler.step()
            global_step += 1
            log_steps += 1
            optimizer.zero_grad()

Someone else advised me to use the accumulation wrapper instead. I am so confused now…
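For reference, this is what I understand the wrapper version would look like. This is only a minimal sketch, assuming the same args, model, optimizer, scheduler, and data_loader as in my loop above; the accumulate() context manager and sync_gradients flag are from the Accelerate docs, and they should also avoid syncing gradients across GPUs on every backward pass, which my manual version does:

from accelerate import Accelerator

# tell Accelerate how many steps to accumulate over
accelerator = Accelerator(
    gradient_accumulation_steps=args.gradient_accumulation_steps
)
model, optimizer, data_loader, scheduler = accelerator.prepare(
    model, optimizer, data_loader, scheduler
)

for epoch in range(init_epoch, args.num_epoch + 1):
    for iteration, (x, y) in enumerate(data_loader):
        # inside accumulate(), Accelerate scales the loss, turns the
        # optimizer/scheduler steps into no-ops, and delays gradient sync
        # until gradient_accumulation_steps backward passes have run
        with accelerator.accumulate(model):
            x_0 = x.to(device, dtype=dtype, non_blocking=True)
            y = None if not use_label else y.to(device, non_blocking=True)
            if is_latent_data:
                z_0 = x_0 * args.scale_factor
            else:
                z_0 = first_stage_model.encode(x_0).latent_dist.sample().mul_(args.scale_factor)
            t = torch.rand((z_0.size(0),), dtype=dtype, device=device).view(-1, 1, 1, 1)
            z_1 = torch.randn_like(z_0)
            z_t = (1 - t) * z_0 + (1e-5 + (1 - 1e-5) * t) * z_1
            u = (1 - 1e-5) * z_1 - z_0
            v = model(t.squeeze(), z_t, y)
            loss = F.mse_loss(v, u)  # no manual division by the accumulation steps
            accelerator.backward(loss)
            optimizer.step()      # only takes effect on accumulation boundaries
            scheduler.step()
            optimizer.zero_grad()
        # count an update only when gradients were actually applied
        if accelerator.sync_gradients:
            global_step += 1
            log_steps += 1

If that is right, then with the wrapper I should not also divide the loss myself or keep the manual (iteration + 1) % args.gradient_accumulation_steps check, otherwise the accumulation would be applied twice. Is that correct?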