Hi All.
The task I have is to do dist.gather
on tensors
of variable size. This happens during prediction stage: often multiple tensors size differ from others by 1. The idea was to pass tensor sizes to destination rank, use these sizes to prepare gather_list
and now do dist.gather
having proper tensor sizes. However this does not work with error: ValueError: ProcessGroupGloo::gather: invalid tensor size at index 1 (expected (2), got (3))
. Changing tensors position in gather_list does not help. Reproducible code:
import argparse
import torch
import torch.distributed as dist
parser = argparse.ArgumentParser(description="dist.gather test")
parser.add_argument('--local_rank', type=int, default=0)
opts = parser.parse_args()
torch.cuda.set_device(opts.local_rank)
dist.init_process_group(backend="gloo", group_name='test')
group = torch.distributed.group.WORLD
if torch.distributed.get_rank(group) == 0:
tensor = torch.Tensor([1, 2])
else:
tensor = torch.Tensor([4, 5, 6])
gather_list_sizes = []
if torch.distributed.get_rank(group) == 0:
gather_list_sizes = [torch.zeros(1), torch.zeros(1)]
dist.gather(
tensor=torch.Tensor([tensor.size()[0]]),
gather_list=gather_list_sizes,
dst=0,
group=group
)
if torch.distributed.get_rank(group) == 0:
print(gather_list_sizes)
gather_list_tensors = []
if torch.distributed.get_rank(group) == 0:
gather_list_tensors = [torch.zeros(int(size.numpy()[0])) for size in gather_list_sizes]
print(gather_list_tensors)
dist.gather(
tensor=tensor,
gather_list=gather_list_tensors,
dst=0,
group=group,
async_op=True
)
if torch.distributed.get_rank(group) == 0:
print(gather_list_tensors)