This code runs perfectly on multiple GPUs; however, I want to adapt it to run on a single GPU using gradient accumulation. How can I do that?
class GatherLayer(torch.autograd.Function):
    """
    Gather tensors from all workers with support for backward propagation:
    this implementation does not cut the gradients as torch.distributed.all_gather does.
    """

    @staticmethod
    def forward(ctx, x):
        output = [torch.zeros_like(x) for _ in range(torch.distributed.get_world_size())]
        torch.distributed.all_gather(output, x)
        return tuple(output)

    @staticmethod
    def backward(ctx, *grads):
        all_gradients = torch.stack(grads)
        torch.distributed.all_reduce(all_gradients)
        return all_gradients[torch.distributed.get_rank()]
def all_gather_with_grad(tensors):
    """
    Performs all_gather operation on the provided tensors.
    Graph remains connected for backward grad computation.
    """
    # Queue the gathered tensors
    world_size = torch.distributed.get_world_size()
    # There is no need for reduction in the single-process case
    if world_size == 1:
        return tensors
    tensor_all = GatherLayer.apply(tensors)
    return torch.cat(tensor_all, dim=0)
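For reference, this is how I sanity-check the helper in a single process (the setup below is mine, not part of the original code: a one-process gloo group just so the torch.distributed calls work):

import os
import torch
import torch.distributed as dist

os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
if not dist.is_initialized():
    dist.init_process_group("gloo", rank=0, world_size=1)

x = torch.randn(4, 8, requires_grad=True)
y = all_gather_with_grad(x)   # world_size == 1, so y is x itself
y.sum().backward()
print(x.grad.shape)           # torch.Size([4, 8]) -- gradients flow through the gather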
def train(….):
    with amp_autocast():
        logits = model(images)
        probs = F.softmax(logits, dim=-1)
        probs_all = utils.all_gather_with_grad(probs)
        probs_batch_avg = probs_all.mean(0)  # average prediction probability across all GPUs
        probs_avg = probs_batch_avg
        loss_xx = -(torch.log(probs_avg)).mean()
        loss = loss_xx + .......
    loss.backward()
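For concreteness, this is roughly how I imagine restructuring the loop for a single GPU with gradient accumulation (a sketch only; accum_steps, optimizer, and loader are placeholder names I introduced). Note that with world_size == 1, all_gather_with_grad simply returns probs, so probs_batch_avg is averaged over the micro-batch only rather than over the full effective batch:

optimizer.zero_grad()
for step, images in enumerate(loader):
    with amp_autocast():
        logits = model(images)
        probs = F.softmax(logits, dim=-1)
        probs_all = utils.all_gather_with_grad(probs)  # no-op gather on one GPU
        probs_batch_avg = probs_all.mean(0)            # micro-batch average only
        loss_xx = -(torch.log(probs_batch_avg)).mean()
        loss = loss_xx  # + ... the other loss terms elided above
    # Scale the loss so the accumulated gradient matches the average over accum_steps micro-batches
    (loss / accum_steps).backward()
    if (step + 1) % accum_steps == 0:
        optimizer.step()
        optimizer.zero_grad()

My concern is whether this is still equivalent to the multi-GPU run, since loss_xx is non-linear in the batch-averaged probabilities.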