Efficiently computing per-example gradient

I have code for efficiently computing per-example gradients.

  def compute_loss(self, params, buffers, sample, target):
      """Return the criterion loss of the model on a single example.

      The model is evaluated statelessly through ``functional_call`` with the
      supplied ``params`` and ``buffers`` rather than the module's own state.

      Args:
          params: Parameters passed to ``functional_call``.
          buffers: Buffers passed to ``functional_call``.
          sample: One input example; a leading dimension of size 1 is added.
          target: The matching target; a leading dimension of size 1 is added.

      Returns:
          The value of ``self.criterion`` on the prediction and the target.
      """
      # Prepend a size-1 dimension so a lone example forms a one-element batch.
      single_input = sample.unsqueeze(0)
      single_target = target.unsqueeze(0)
      state = (params, buffers)
      output = functional_call(self.model, state, (single_input,))
      return self.criterion(output, single_target)

  def estimate_grad_incoh(self):
    """Compute per-example gradients on a random subsample of the training set.

    Draws up to 2000 training examples without replacement and evaluates
    ``vmap(grad(self.compute_loss))`` on them, giving one gradient per example
    for every entry of ``params``.

    Returns:
      The per-sample gradients produced for the last batch processed (the
      loader uses a single batch covering the whole subsample), or ``None``
      when the training set is empty.
    """
    num_trainset = len(self.train_dataset)
    if num_trainset == 0:
      return None

    # Never ask for more examples than exist; sampling without replacement
    # would otherwise raise.
    subsample_size = min(2000, num_trainset)
    subsample_idx = np.random.choice(num_trainset, size=subsample_size, replace=False)
    data_subsampled = data_utils.Subset(self.train_dataset, subsample_idx.tolist())

    params = {k: v.detach() for k, v in self.model.named_parameters()}
    buffers = {k: v.detach() for k, v in self.model.named_buffers()}

    # Building the transforms performs no computation, so no no_grad() context
    # is needed here.
    ft_compute_grad = grad(self.compute_loss)
    ft_compute_sample_grad = vmap(ft_compute_grad, in_dims=(None, None, 0, 0))
    data_loader = torch.utils.data.DataLoader(data_subsampled, batch_size=subsample_size)

    # NOTE(review): assumes a privacy-engine wrapper exposes Opacus-style
    # `hooks_enabled` / `disable_hooks()` / `enable_hooks()` - confirm. Its
    # forward hooks expect bookkeeping attributes on the module's own
    # parameters, which the plain tensors substituted by functional_call do
    # not carry, so the hooks are paused for the functional pass.
    hooks_paused = (
        bool(getattr(self.model, "hooks_enabled", False))
        and hasattr(self.model, "disable_hooks")
        and hasattr(self.model, "enable_hooks")
    )
    if hooks_paused:
      self.model.disable_hooks()

    ft_per_sample_grads = None
    try:
      for x, t in data_loader:
        x, t = x.to(device), t.to(device)
        if self.model_type == 'mlp':
          # Flatten each example to the input width of the MLP.
          x = x.reshape(-1, self.input_size)
        ft_per_sample_grads = ft_compute_sample_grad(params, buffers, x, t)
    finally:
      # Restore the hooks even if the gradient computation fails.
      if hooks_paused:
        self.model.enable_hooks()

    return ft_per_sample_grads

However, when I wrap the optimizer and model with the privacy engine, I get the following error:

AttributeError: 'Tensor' object has no attribute '_forward_counter'

Interestingly, without the privacy engine I do not get any error.

Any idea?