Are the communication collectives in torch.cuda.comm synchronized across the participating devices?

I think they are.

By the way, I wrote a simple script to verify this, but it fails with a segmentation fault and I do not know why.

import time

import torch
import torch.cuda.comm
from torch import nn


class Tmp(nn.Module):
    def __init__(self):
        super().__init__()
        # Keep the parameter on the CPU here; DataParallel expects the
        # module's parameters on the source device, so the whole module
        # is moved to GPU 0 below with .cuda().
        self.x = nn.Parameter(torch.empty(1))

    def forward(self):
        # Inside a DataParallel replica the current device is the replica's
        # device, so .cuda() places tensors on that replica's GPU.
        if torch.cuda.current_device() == 0:
            print('I am device 0')
            a = torch.tensor([0.]).cuda()
            print('I am device 0, and a is {}'.format(a))
            print('I am device 0, and I sleep')
            time.sleep(10)
            # broadcast() returns a tuple with one copy per target device;
            # it does not modify `a` in place.
            outs = torch.cuda.comm.broadcast(a, (0, 1))
            print('I am device 0, and I wake and broadcast, and outs is {}'.format(outs))
        else:
            print('I am device 1')
            a = torch.tensor([1.]).cuda()
            print('I am device 1, and a is {}'.format(a))
            print('I am device 1, and I do not sleep')
            outs = torch.cuda.comm.broadcast(a, (0, 1))
            print('I am device 1, and I broadcast, and outs is {}'.format(outs))


a = Tmp().cuda()
b = torch.nn.DataParallel(a, device_ids=[0, 1])
b()
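
For comparison, here is a much simpler single-process sketch (assuming two visible GPUs) that calls broadcast directly from the main thread instead of inside the DataParallel replicas, which might help narrow down where the segfault comes from:

import torch
import torch.cuda.comm

# Broadcast a tensor from GPU 0 to GPUs 0 and 1; the returned tuple holds
# one copy per device listed in the second argument.
src = torch.tensor([42.], device='cuda:0')
copies = torch.cuda.comm.broadcast(src, (0, 1))

# Wait for both devices before inspecting the results.
torch.cuda.synchronize(0)
torch.cuda.synchronize(1)
for i, t in enumerate(copies):
    print('copy on device {}: {}'.format(i, t))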