Hi,
I am testing p2p communication of torch.distributed.
I have 2 nodes, with gloo backend.
When I isend/irecv multiple tensors with different `tag` values, I don't get the expected result.
Could somebody help me with async p2p?
Node0:
import time
import torch
import torch.distributed as dist

if __name__ == "__main__":
    rank = 0
    dist.init_process_group(backend="gloo",
                            init_method='tcp://192.168.1.12:23457',
                            world_size=2,
                            rank=rank)
    grads = {'T0_grad': torch.zeros(2, 2), 'T1_grad': torch.zeros(2, 2),
             'T2_grad': torch.zeros(2, 2), 'T3_grad': torch.zeros(2, 2),
             'T4_grad': torch.zeros(2, 2)}
    if rank == 1:
        tmp_tensor = torch.ones(2, 2)
        req = dist.isend(tmp_tensor, dst=0, tag=0)
        print('rank', rank, ' dist.isend(tmp_tensor):\n', tmp_tensor)
        tmp_tensor2 = torch.ones(2, 2) * 4
        req = dist.isend(tmp_tensor2, dst=0, tag=1)
        print('rank', rank, ' dist.isend(tmp_tensor2):\n', tmp_tensor2)
        time.sleep(6)  # keep the process alive so the sends can complete
    elif rank == 0:
        time.sleep(1)
        i = 3
        req = dist.irecv(grads['T' + str(i) + '_grad'], src=1, tag=0)
        print('rank', rank, ' dist.irecv(grads[T' + str(i) + '_grad]):\n', grads['T' + str(i) + '_grad'])
        i = 4
        req = dist.irecv(grads['T' + str(i) + '_grad'], src=1, tag=1)
        print('rank', rank, ' dist.irecv(grads[T' + str(i) + '_grad]):\n', grads['T' + str(i) + '_grad'])
Node1:
import time
import torch
import torch.distributed as dist

if __name__ == "__main__":
    rank = 1
    # ... all else equal ...
Result:
rank 1  dist.isend(tmp_tensor):
 tensor([[1., 1.],
        [1., 1.]])
rank 1  dist.isend(tmp_tensor2):
 tensor([[4., 4.],
        [4., 4.]])
rank 0  dist.irecv(grads[T3_grad]):
 tensor([[0., 0.],
        [0., 0.]])
rank 0  dist.irecv(grads[T4_grad]):
 tensor([[0., 0.],
        [0., 0.]])