# How does one reuse the autograd computational graph

I tried posting this as a continuation of the following post and did not get a response:

Consider the example case of computing determinants using expansion by minors.
I know this is not an efficient way to compute determinants, but it demonstrates a case that has lots of operations on scalar values.

I am trying to re-use the graph for this case. My method works for 2 by 2 matrices but fails for 3 by 3 matrices; see the Python programs below. What am I doing wrong?

det_22.py

```
# Test reusing graph for derivatives of determinant of 2 by 2 matrix.
# The output generated by this program is below:
#
# First gradient passed check.
# Second gradient passed check.
#
# imports
import sys
import torch
import numpy
#
# grad_check
def grad_check(ax) :
    #
    # ok, eps99
    ok    = True
    eps99 = 99.0 * numpy.finfo(float).eps
    #
    # ok
    check = ax.data[1,1]
    if abs( ax.grad[0,0] - check ) > eps99 :
        ok = False
    #
    # ok
    check = -ax.data[1,0]
    if abs( ax.grad[0,1] - check ) > eps99 :
        ok = False
    #
    # ok
    check = -ax.data[0,1]
    if abs( ax.grad[1,0] - check ) > eps99 :
        ok = False
    #
    # ok
    check = ax.data[0,0]
    if abs( ax.grad[1,1] - check ) > eps99 :
        ok = False
    #
    return ok
#
# main
def main() :
    #
    # n
    n = 2
    #
    # ax
    x  = numpy.random.uniform(0.0, 1.0, (n , n) )
    ax = torch.tensor(x, requires_grad = True)
    #
    # az
    az  = ax[0,0] * ax[1,1] - ax[0,1] * ax[1,0]
    #
    az.backward(retain_graph = True)
    #
    # ok
    ok = grad_check(ax)
    if ok :
        print( 'First gradient passed check.' )
    else :
        print( 'First gradient failed check.' )
    #
    # ax.data
    x  = numpy.random.uniform(0.0, 1.0, (n, n) )
    for i in range(n) :
        for j in range(n) :
            ax.data[i,j] = x[i,j]
    #
    # reset the gradient accumulated by the first backward pass
    ax.grad.zero_()
    #
    az.backward(retain_graph = True)
    #
    # ok
    ok = grad_check(ax)
    if ok :
        print( 'Second gradient passed check.' )
    else :
        print( 'Second gradient failed check.' )
#
main()
```

det_33.py

```
# Test reusing graph for derivatives of determinant of 3 by 3 matrix.
# The output generated by this program is below. The actual numbers
# in the output will vary because a different random matrix is chosen
# for each evaluation.
#
# First gradient passed check.
# ax.grad[0,0] = 0.07585514040844837, check = 0.4295608074373773
# ax.grad[0,1] = -0.6133183512861293, check = -0.11782369019260797
# ax.grad[0,2] = 0.5337097801031835, check = 0.040633019648616306
# Second gradient failed check.
#
# imports
import torch
import numpy
#
# grad_check
def grad_check(ax) :
    #
    # ok, eps99
    ok    = True
    eps99 = 99.0 * numpy.finfo(float).eps
    #
    # ok
    check = ( ax[1,1] * ax[2,2] - ax[1,2] * ax[2,1] )
    if abs( ax.grad[0,0] - check ) > eps99 :
        print( f'ax.grad[0,0] = {ax.grad[0,0].item()}, check = {check.item()}' )
        ok = False
    #
    # ok
    check = - ( ax[1,0] * ax[2,2] - ax[1,2] * ax[2,0] )
    if abs( ax.grad[0,1] - check ) > eps99 :
        print( f'ax.grad[0,1] = {ax.grad[0,1].item()}, check = {check.item()}' )
        ok = False
    #
    # ok
    check = ( ax[1,0] * ax[2,1] - ax[1,1] * ax[2,0] )
    if abs( ax.grad[0,2] - check ) > eps99 :
        print( f'ax.grad[0,2] = {ax.grad[0,2].item()}, check = {check.item()}' )
        ok = False
    #
    return ok
#
# main
def main() :
    #
    # n
    n = 3
    #
    # ax
    x  = numpy.random.uniform(0.0, 1.0, (n, n))
    ax = torch.tensor(x, requires_grad = True)
    #
    #  ax[0,0]  ax[0,1]  ax[0,2]
    #  ax[1,0]  ax[1,1]  ax[1,2]
    #  ax[2,0]  ax[2,1]  ax[2,2]
    #
    # az : determinant by expansion by minors along the first row
    az  = ax[0,0] * ( ax[1,1] * ax[2,2] - ax[1,2] * ax[2,1] )
    az -= ax[0,1] * ( ax[1,0] * ax[2,2] - ax[1,2] * ax[2,0] )
    az += ax[0,2] * ( ax[1,0] * ax[2,1] - ax[1,1] * ax[2,0] )
    #
    az.backward(retain_graph = True)
    #
    # ok
    ok = grad_check(ax)
    if ok :
        print( 'First gradient passed check.' )
    else :
        print( 'First gradient failed check.' )
    #
    # ax.data
    x  = numpy.random.uniform(0.0, 1.0, (n, n) )
    for i in range(n) :
        for j in range(n) :
            ax.data[i,j] = x[i,j]
    #
    # reset the gradient accumulated by the first backward pass
    ax.grad.zero_()
    #
    az.backward(retain_graph = True)
    #
    # ok
    ok = grad_check(ax)
    if ok :
        print( 'Second gradient passed check.' )
    else :
        print( 'Second gradient failed check.' )
#
main()
```

The short story is that you are using `.data`, which is deprecated (in the
public-facing API) and can lead to errors such as yours.

This works for your 2-by-2 case, in essence, by accident.

When you compute `az`, autograd stores some of the product tensors, e.g.,
`ax[1,1] * ax[2,2]`, as temporary values in the computation graph itself,
for use during the backward pass.

By using `.data` (whose public use is deprecated), you hide the modification
of `ax` from autograd. The temporary tensor stored in the computation graph
is now out of date with respect to the modified value of `ax` (but autograd
doesn’t know this). So when you call `az.backward()` the second time, the
backward pass silently uses the stale temporary values and produces an
incorrect gradient.

Consider:

```
>>> import torch
>>> print (torch.__version__)
2.1.0
>>>
>>> a = torch.tensor ([1.0, 2.0, 3.0], requires_grad = True)
>>>
>>> # autograd will save  a[0] * a[1]  as an intermediate result in the computation graph
>>> # let's call it  tmp = tensor(2.0, grad_fn=<MulBackward0>)
>>> # it will be used during p.backward() to compute  a.grad[2] = 2.0
>>> p = a[0] * a[1] * a[2]
>>>
>>> p.backward (retain_graph = True)
>>> a.grad
tensor([6., 3., 2.])
>>>
>>> x = torch.tensor ([10.0, 20.0, 30.0])
>>> for  i in range (3):
...     a.data[i] = x[i]   # using .data tricks autograd
...
>>> a.grad.zero_()
tensor([0., 0., 0.])
>>> p.backward()           # uses stale tmp to compute incorrect value for a.grad
>>> a.grad
tensor([600., 300.,   2.])
>>>
>>> # let's not trick autograd
>>> y = torch.tensor ([2.0, 4.0, 6.0])
>>> for  i in range (3):
...     a[i] = y[i]        # no .data -- autograd works and catches your error
...
Traceback (most recent call last):
  File "<stdin>", line 2, in <module>
RuntimeError: a view of a leaf Variable that requires grad is being used in an in-place operation.
```
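
For contrast, here is a small sketch (my own construction, not taken from your
programs) of why your 2-by-2 version happens to give the right answer: every
tensor the 2-by-2 graph saves for the backward pass is a view into the leaf
tensor itself, so a write through `.data`, although hidden from autograd, is
nonetheless visible to the backward pass.

```
import torch

# 2-by-2 determinant in the same spirit as det_22.py (illustrative, not the original code)
a = torch.tensor([[1.0, 2.0], [3.0, 4.0]], requires_grad = True)
det = a[0,0] * a[1,1] - a[0,1] * a[1,0]
det.backward(retain_graph = True)
print(a.grad)        # the correct cofactors: [[4., -3.], [-2., 1.]]

a.grad.zero_()
a.data[0,0] = 10.0   # hidden from autograd, but the saved tensors are views of a,
a.data[1,1] = 40.0   # so the backward pass happens to see the new values anyway
det.backward()
print(a.grad)        # [[40., -3.], [-2., 10.]] -- correct for the new values, by accident
```

In the 3-by-3 case this accident no longer saves you, because the saved
cofactor products are new tensors with their own storage, not views of `ax`.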

To be clear, you can’t reuse the computation graph in the way you propose
because, in addition to storing a record of the forward-pass operations, it
also stores various data-dependent intermediate tensors for use in the
backward pass. When the input data changes, you have to rebuild a new
computation graph (that is, perform the forward pass again), because you
have to compute new values for those data-dependent intermediate tensors.
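
As a minimal sketch of what that looks like in practice (my own code, not from
your post: the helper name `det3`, the `torch.no_grad()` update, and the
Jacobi-formula check are all my choices), you redo the forward pass for each
new matrix and update the leaf tensor's values without going through `.data`:

```
import torch

def det3(a):
    # expansion by minors along the first row, same formula as in det_33.py
    return (  a[0,0] * ( a[1,1] * a[2,2] - a[1,2] * a[2,1] )
            - a[0,1] * ( a[1,0] * a[2,2] - a[1,2] * a[2,0] )
            + a[0,2] * ( a[1,0] * a[2,1] - a[1,1] * a[2,0] ) )

ax = torch.rand(3, 3, dtype = torch.float64, requires_grad = True)

# first evaluation: the forward pass builds the graph, the backward pass consumes it
det3(ax).backward()
# gradient of det(A) with respect to A is det(A) * inverse(A)^T (Jacobi's formula)
print(torch.allclose(ax.grad, torch.det(ax) * torch.inverse(ax).t()))   # True

# new input values: update in place under no_grad() instead of going through .data
with torch.no_grad():
    ax.copy_(torch.rand(3, 3, dtype = torch.float64))
ax.grad = None            # discard the gradient from the previous input

# second evaluation: a new forward pass builds a new graph with fresh intermediates
det3(ax).backward()
print(torch.allclose(ax.grad, torch.det(ax) * torch.inverse(ax).t()))   # True
```

Updating `ax` inside `torch.no_grad()` (or simply creating a fresh tensor)
keeps autograd's bookkeeping consistent, and the new forward pass recomputes
the data-dependent intermediates that the backward pass needs.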

Best.

K. Frank