Autograd.backward() doesn't trigger reduction for Jacobian vector product

Hi everyone! I found that autograd.backward() doesn't trigger reduction when I tried to compute a certain Jacobian-vector product (w.r.t. the D network) in the following toy GAN case. However, if I use the same approach to compute the Jacobian-vector product w.r.t. the G network, DDP works perfectly with autograd.backward(). This is a follow-up to my previous post, where I found a way to compute Jacobian-vector products with DDP (ddp-second-backward-accumulate-the-wrong-gradient).

Compute the Jacobian-vector product w.r.t. the D network: as shown below, the results I got from backward() are different across devices.

Running on rank 0
Running on rank 1
Hessian vector product of d param: tensor([3., 3., 0.], device='cuda:1')
Hessian vector product of d param: tensor([2., 2., 0.], device='cuda:0')
Done!

The block below is the repro code.

import torch
import torch.multiprocessing as mp
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

from utils.helper import setup, cleanup
from argparse import ArgumentParser

# NOTE(review): two helper functions were garbled in this paste -- their
# `def` lines and most statements are missing, leaving only the docstrings
# and a few orphaned lines. Judging by the docstrings, the first zeroes
# each parameter's grad and the second concatenates grads into a single
# vector (substituting zeros for None grads). Recover the full helpers
# from the original post before running; as-is this is not valid Python.
'''
Clean the gradient of each parameter
'''
for p in params:

'''
Collect grads of parameters and concatenate them into a vector.
If grad is None, it will be filled with zeros
:param params: list of parameters
:return: vector
'''
for p in params:
else:
# replace None with zeros

# Per-rank worker: builds a tiny D (2->1) / G (1->2) linear pair with fixed
# weights and computes a Jacobian/Hessian-vector product w.r.t. D's parameters.
# NOTE(review): indentation was flattened and several lines were dropped in
# this paste -- the DDP wrapping under `if args.distributed:` and the autograd
# call that the dangling `inputs=d_params)` fragment belongs to are missing
# (a corrected version of this block appears later in the post).
def subprocess_fn(rank, args):
setup(rank, args.num_gpus)
print(f'Running on rank {rank}')
D = nn.Linear(2, 1, bias=True).to(rank)
G = nn.Linear(1, 2, bias=True).to(rank)
# initialize weights
nn.init.constant_(D.weight, 2.0)
nn.init.constant_(D.bias, -1.0)
nn.init.constant_(G.weight, 4.0)
nn.init.constant_(G.bias, 1.0)

# presumably D and G are wrapped in DDP here -- body lost in the paste.
if args.distributed:

d_params = list(D.parameters())
g_params = list(G.parameters())

# Each rank feeds a different z, so per-rank gradients differ; DDP's
# all-reduce is what should make the final result agree across ranks.
if not args.distributed:
z = torch.tensor([[2.0], [1.0]]).to(rank)
elif rank == 0:
z = torch.tensor([[1.0]]).to(rank)
elif rank == 1:
z = torch.tensor([[2.0]]).to(rank)

loss = D(G(z)).mean()

# NOTE(review): orphaned fragment of an autograd call taking derivatives
# w.r.t. d_params; `hvp_d` is presumably produced by a collect-grad helper
# -- confirm against the original code.
inputs=d_params)
print(f'Hessian vector product of d param: {hvp_d}')
cleanup()

if __name__ == '__main__':
    torch.backends.cudnn.benchmark = True

    # Command line: --num_gpus selects single-process vs. multi-process mode.
    cli = ArgumentParser()
    cli.add_argument('--num_gpus', type=int, help='Number of GPUs', default=1)
    args = cli.parse_args()
    args.distributed = args.num_gpus > 1

    if not args.distributed:
        # Single GPU: run the worker inline as rank 0.
        subprocess_fn(0, args)
    else:
        # One worker process per GPU.
        mp.spawn(subprocess_fn, args=(args, ), nprocs=args.num_gpus)

    print('Done!')

Compute the Jacobian-vector product w.r.t. the G network: as shown below, the Jacobian-vector product is synchronized across devices.

Running on rank 0
Running on rank 1
Hessian vector product of g param: tensor([1.5000, 1.5000, 1.0000, 1.0000], device='cuda:1')
Hessian vector product of g param: tensor([1.5000, 1.5000, 1.0000, 1.0000], device='cuda:0')
Done!

The repro code below only switches the order in which the derivatives are taken.

import torch
import torch.multiprocessing as mp
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

from utils.helper import setup, cleanup
from argparse import ArgumentParser

# NOTE(review): two helper functions were garbled in this paste -- their
# `def` lines and most statements are missing, leaving only the docstrings
# and a few orphaned lines. Judging by the docstrings, the first zeroes
# each parameter's grad and the second concatenates grads into a single
# vector (substituting zeros for None grads). Recover the full helpers
# from the original post before running; as-is this is not valid Python.
'''
Clean the gradient of each parameter
'''
for p in params:

'''
Collect grads of parameters and concatenate them into a vector.
If grad is None, it will be filled with zeros
:param params: list of parameters
:return: vector
'''
for p in params:
else:
# replace None with zeros

# Per-rank worker: same setup as the D-side repro, but the second derivative
# is taken w.r.t. G's parameters (order of differentiation is swapped).
# NOTE(review): indentation was flattened and lines were dropped in this
# paste -- the DDP wrapping under `if args.distributed:` and the autograd
# call that the dangling `inputs=g_params)` fragment belongs to are missing;
# recover them from the original post before running.
def subprocess_fn(rank, args):
setup(rank, args.num_gpus)
print(f'Running on rank {rank}')
D = nn.Linear(2, 1, bias=True).to(rank)
G = nn.Linear(1, 2, bias=True).to(rank)
# initialize weights
nn.init.constant_(D.weight, 2.0)
nn.init.constant_(D.bias, -1.0)
nn.init.constant_(G.weight, 4.0)
nn.init.constant_(G.bias, 1.0)

# presumably D and G are wrapped in DDP here -- body lost in the paste.
if args.distributed:

d_params = list(D.parameters())
g_params = list(G.parameters())

# Each rank feeds a different z, so per-rank gradients differ; DDP's
# all-reduce is what should make the final result agree across ranks.
if not args.distributed:
z = torch.tensor([[2.0], [1.0]]).to(rank)
elif rank == 0:
z = torch.tensor([[1.0]]).to(rank)
elif rank == 1:
z = torch.tensor([[2.0]]).to(rank)

loss = D(G(z)).mean()

# NOTE(review): orphaned fragment of the autograd call described in its
# trailing comment; in this G-side variant the reduction does fire and
# hvp_g matches across ranks (see the printed output above).
inputs=g_params)   # compute d{torch.dot(gradvec_d, vec)} / d{G}
hvp_g = collect_grad(g_params)  # gather results

print(f'Hessian vector product of g param: {hvp_g}')
cleanup()

if __name__ == '__main__':
    torch.backends.cudnn.benchmark = True

    # Parse --num_gpus; more than one GPU means DDP mode.
    arg_parser = ArgumentParser()
    arg_parser.add_argument('--num_gpus', type=int, help='Number of GPUs', default=1)
    args = arg_parser.parse_args()
    args.distributed = args.num_gpus > 1

    if args.distributed:
        # Fork one worker process per GPU; each gets its rank as first arg.
        mp.spawn(subprocess_fn, args=(args, ), nprocs=args.num_gpus)
    else:
        subprocess_fn(0, args)

    print('Done!')

Update: there are some typos in the first block, but my question remains the same.

The first code block should be:

import torch
import torch.multiprocessing as mp
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP

from utils.helper import setup, cleanup
from argparse import ArgumentParser

# NOTE(review): two helper functions were garbled in this paste -- their
# `def` lines and most statements are missing, leaving only the docstrings
# and a few orphaned lines. Judging by the docstrings, the first zeroes
# each parameter's grad and the second concatenates grads into a single
# vector (substituting zeros for None grads). Recover the full helpers
# from the original post before running; as-is this is not valid Python.
'''
Clean the gradient of each parameter
'''
for p in params:

'''
Collect grads of parameters and concatenate them into a vector.
If grad is None, it will be filled with zeros
:param params: list of parameters
:return: vector
'''
for p in params:
else:
# replace None with zeros

# Corrected D-side worker (per the "Update" note above), though this paste
# is still garbled: indentation is flattened, the DDP wrapping under
# `if args.distributed:` is missing its body, and the autograd call for the
# dangling `inputs=d_params)` fragment was dropped. Recover the missing
# lines from the original post before running.
def subprocess_fn(rank, args):
setup(rank, args.num_gpus)
print(f'Running on rank {rank}')
D = nn.Linear(2, 1, bias=True).to(rank)
G = nn.Linear(1, 2, bias=True).to(rank)
# initialize weights
nn.init.constant_(D.weight, 2.0)
nn.init.constant_(D.bias, -1.0)
nn.init.constant_(G.weight, 4.0)
nn.init.constant_(G.bias, 1.0)

# presumably D and G are wrapped in DDP here -- body lost in the paste.
if args.distributed:

d_params = list(D.parameters())
g_params = list(G.parameters())

# Each rank feeds a different z, so per-rank gradients differ; DDP's
# all-reduce is what should make the final result agree across ranks.
if not args.distributed:
z = torch.tensor([[2.0], [1.0]]).to(rank)
elif rank == 0:
z = torch.tensor([[1.0]]).to(rank)
elif rank == 1:
z = torch.tensor([[2.0]]).to(rank)

loss = D(G(z)).mean()
# NOTE(review): orphaned fragment of an autograd call taking derivatives
# w.r.t. d_params; `hvp_d` is presumably produced by a collect-grad helper.
inputs=d_params)
print(f'Hessian vector product of d param: {hvp_d}')
cleanup()

if __name__ == '__main__':
    torch.backends.cudnn.benchmark = True

    # --num_gpus > 1 selects the distributed (one process per GPU) path.
    parser = ArgumentParser()
    parser.add_argument('--num_gpus', type=int, help='Number of GPUs', default=1)
    args = parser.parse_args()
    args.distributed = args.num_gpus > 1

    if not args.distributed:
        subprocess_fn(0, args)
    else:
        mp.spawn(subprocess_fn, args=(args, ), nprocs=args.num_gpus)

    print('Done!')