This code runs perfectly on multiple GPUs; however, I want to adapt it to run on a single GPU using gradient accumulation. How can I do that?
class GatherLayer(torch.autograd.Function):
    """
    Gather tensors from all workers with support for backward propagation:
    this implementation does not cut the gradients as torch.distributed.all_gather does.
    """

    @staticmethod
    def forward(ctx, x):
        output = [torch.zeros_like(x) for _ in range(torch.distributed.get_world_size())]
        torch.distributed.all_gather(output, x)
        return tuple(output)

    @staticmethod
    def backward(ctx, *grads):
        all_gradients = torch.stack(grads)
        torch.distributed.all_reduce(all_gradients)
        return all_gradients[torch.distributed.get_rank()]
def all_gather_with_grad(tensors):
    """
    Performs all_gather operation on the provided tensors.
    Graph remains connected for backward grad computation.
    """
    # Queue the gathered tensors
    world_size = torch.distributed.get_world_size()
    # There is no need for reduction in the single-process case
    if world_size == 1:
        return tensors
    tensor_all = GatherLayer.apply(tensors)
    return torch.cat(tensor_all, dim=0)
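For reference, this is how I sanity-check the helper in a single process (the setup below is mine, not part of the original code: a one-process gloo group just so the torch.distributed calls work):

import os
import torch
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
if not dist.is_initialized():
    dist.init_process_group("gloo", rank=0, world_size=1)

x = torch.randn(4, 8, requires_grad=True)
y = all_gather_with_grad(x)   # world_size == 1, so y is x itself
y.sum().backward()
print(x.grad.shape)           # torch.Size([4, 8]) -- gradients flow through the gather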
def train(….):
    with amp_autocast():
        logits = model(images)
        probs = F.softmax(logits, dim=-1)
        probs_all = utils.all_gather_with_grad(probs)
        probs_batch_avg = probs_all.mean(0)  # average prediction probability across all GPUs
        probs_avg = probs_batch_avg
        loss_xx = -(torch.log(probs_avg)).mean()
        loss = loss_xx + .......
    loss.backward()
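For concreteness, this is roughly how I imagine restructuring the loop for a single GPU with gradient accumulation (a sketch only; accum_steps, optimizer, and loader are placeholder names I introduced). Note that with world_size == 1, all_gather_with_grad simply returns probs, so probs_batch_avg is averaged over the micro-batch only rather than over the full effective batch:

optimizer.zero_grad()
for step, images in enumerate(loader):
    with amp_autocast():
        logits = model(images)
        probs = F.softmax(logits, dim=-1)
        probs_all = utils.all_gather_with_grad(probs)  # no-op gather on one GPU
        probs_batch_avg = probs_all.mean(0)            # micro-batch average only
        loss_xx = -(torch.log(probs_batch_avg)).mean()
        loss = loss_xx  # + ... the other loss terms elided above
    # Scale the loss so the accumulated gradient matches the average over accum_steps micro-batches
    (loss / accum_steps).backward()
    if (step + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

My concern is whether this is still equivalent to the multi-GPU run, since loss_xx is non-linear in the batch-averaged probabilities.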