I'm playing with a custom autograd Function that reduce-adds tensors living on several GPUs, and I noticed that whether the output requires grad depends on how the inputs are passed to apply():

In [1]: import torch

In [2]: from torch.autograd import Function

In [3]: from torch.cuda import comm
   ...: from torch.nn.parallel._functions import Broadcast  # used in backward below

In [4]: a = [torch.tensor([0.], device=torch.device(i), requires_grad=True) for i in range(4)]
In [5]: class Reduce(Function):
   ...:     @staticmethod
   ...:     def forward(ctx, inputs):  # takes a single argument: a list of tensors
   ...:         ctx.target_gpus = [inp.get_device() for inp in inputs]
   ...:         inputs = sorted(inputs, key=lambda i: i.get_device())
   ...:         return comm.reduce_add(inputs)
   ...:     @staticmethod
   ...:     def backward(ctx, gradOutput):
   ...:         return Broadcast.apply(ctx.target_gpus, gradOutput)
In [6]: b = Reduce.apply(a)

In [7]: b.requires_grad
Out[7]: False
But if forward instead takes the tensors unpacked as separate arguments:

In [8]: class Reduce(Function):
   ...:     @staticmethod
   ...:     def forward(ctx, *inputs):  # takes the tensors as separate arguments
   ...:         ctx.target_gpus = [inp.get_device() for inp in inputs]
   ...:         inputs = sorted(inputs, key=lambda i: i.get_device())
   ...:         return comm.reduce_add(inputs)
   ...:     @staticmethod
   ...:     def backward(ctx, gradOutput):
   ...:         return Broadcast.apply(ctx.target_gpus, gradOutput)
In [9]: c = Reduce.apply(*a)

In [10]: c.requires_grad
Out[10]: True
The only difference between the two versions is whether apply() receives the tensors as one list or unpacked as separate arguments. Could anyone explain why their behaviors differ? Thank you.
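
In case it helps, here is a minimal CPU-only sketch that seems to show the same difference without needing multiple GPUs (ListSum and StarSum are toy names I made up, and sum() just stands in for comm.reduce_add):

import torch
from torch.autograd import Function

class ListSum(Function):
    @staticmethod
    def forward(ctx, inputs):          # single argument: a list of tensors
        ctx.num_inputs = len(inputs)
        return sum(inputs)

    @staticmethod
    def backward(ctx, grad_output):
        # Apparently never reached: the output does not require grad,
        # so autograd never calls backward for this version.
        return [grad_output] * ctx.num_inputs

class StarSum(Function):
    @staticmethod
    def forward(ctx, *inputs):         # tensors unpacked as separate arguments
        ctx.num_inputs = len(inputs)
        return sum(inputs)

    @staticmethod
    def backward(ctx, grad_output):
        # One gradient per unpacked input tensor.
        return (grad_output,) * ctx.num_inputs

xs = [torch.zeros(1, requires_grad=True) for _ in range(4)]

print(ListSum.apply(xs).requires_grad)   # False: tensors are hidden inside the list
print(StarSum.apply(*xs).requires_grad)  # True: apply() sees the tensors directly

Both classes compute exactly the same thing; the only difference is how apply() receives the tensors, yet the requires_grad of the output flips.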