Use 2 Nvidia GPUs with DistributedDataParallel

I am using 2 Nvidia GPUs for image training with DistributedDataParallel, but I am getting an unexpected error:

dist._broadcast_coalesced(self.process_group, tensors, buffer_size)
RuntimeError: flock: Input/output error
Aborted (core dumped)

With a single GPU it works fine, but I get the error above as soon as I use two GPUs in parallel.

Here is the function used for initialization:

def create_grids(self, img_size=416, gridsize=(13, 13), device='cpu', type=torch.float32):
    """ Calculate a grid with the given gridsize over the input image.

        img_size: (width and height) of the input image
        gridsize: size of the grid projected over the input image
        device: cpu or gpu index to run the calculation on
        type: (float) dtype to use on the device
    """
    nx, ny = gridsize  # x and y grid size
    try:
        self.img_size = max(img_size)  # take the biggest dimension out of width or height to calculate the stride
    except TypeError:  # if only one dimension is given, take that
        self.img_size = int(img_size)
    self.stride = self.img_size / max(gridsize)

    # build xy offsets
    yv, xv = torch.meshgrid([torch.arange(ny), torch.arange(nx)])
    self.grid_xy = torch.stack((xv, yv), 2).to(device).type(type).view((1, 1, ny, nx, 2))

    # build wh gains
    self.anchor_vec = self.anchors.to(device) / self.stride
    self.anchor_wh = self.anchor_vec.view(1, self.number_of_anchers, 1, 1, 2).to(device).type(type)
    self.gridsize = torch.Tensor(gridsize).to(device)
    self.nx = nx
    self.ny = ny

The error comes precisely from the last line of the section below. I initialized distributed training this way:

# Initialize distributed training
if len(gpu_list) > 1:
    # generate the path of a (non-existing) file, used to set up the distributed learning
    assert distributed_folder, "The distributed-training folder isn't set"  # check if not the default 0
    distributed_learning_filename = str(Nnet_name) + "_distlearn_setup"  # remove this file when the program is stopped!!
    distributed_init_filepath = os.path.join(distributed_folder, distributed_learning_filename)
    # if there is more than one GPU index, use distributed training
    dist.init_process_group(backend='nccl',  # use distributed backend 'nccl'
                            init_method='file://' + str(distributed_init_filepath), #file used to setup the distributed learning
                            world_size = distributed_world_size, #number of nodes for distributed training
                            rank = distributed_node_rank) #distributed training node rank

    model = torch.nn.parallel.DistributedDataParallel(model, find_unused_parameters=True) 
    model.yolo_layers = model.module.Get_YOLO_layers_list() # move yolo layer indices to top level

Could you give me an idea about the error above, or any suggestion as to why I’m getting it? Please let me know if you need more info. Looking forward to any suggestions.

Are you using multiple machines?
If not, why don’t you try nn.DataParallel instead?
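Something like this (a minimal sketch, assuming the model and gpu_list from your snippet):

if len(gpu_list) > 1:
    # nn.DataParallel splits each batch across the listed GPUs on a single
    # machine; no process group, init file, or rank setup is needed
    model = torch.nn.DataParallel(model, device_ids=gpu_list)
    model.yolo_layers = model.module.Get_YOLO_layers_list()  # keep the top-level alias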

Thanks @lucastononrodrigues for your quick and helpful feedback.
I tried nn.DataParallel. Unfortunately, it worked for a moment and then threw another error:

AttributeError: 'YOLOLayer' object has no attribute 'gridsize'

The function related to this is the same create_grids shown above.

The error now comes from these lines:

for yololayer in yolo_layers_list:
    # get number of grid points and anchor vec for this yolo layer
    ng, anchor_vec = yololayer.gridsize, yololayer.anchor_vec

The complete function is given below:

def build_targets(model, targets):
    # targets = [image, class, x, y, w, h]

    number_of_targets = len(targets)
    tcls, tbox, indices, av = [], [], [], []

    # get the yolo-layers list and the number_of_classes
    multi_gpu = type(model) in (nn.parallel.DataParallel, nn.parallel.DistributedDataParallel)  # check if this instance of the model runs distributed
    if multi_gpu:
        yolo_layers_list = model.module.Get_YOLO_layers_list()  # if the model runs distributed, then Get_YOLO_layers_list() is stored under model.module
        number_of_classes = model.module.num_classes  # if the model runs distributed, then num_classes is stored under model.module
    else:
        yolo_layers_list = model.Get_YOLO_layers_list()
        number_of_classes = model.num_classes

    # go over the yolo-detector layers in the model
    for yololayer in yolo_layers_list:
        # get the number of grid points and the anchor vec for this yolo layer
        ng, anchor_vec = yololayer.gridsize, yololayer.anchor_vec

        # iou of targets-anchors
        t, a = targets, []
        gwh = t[:, 4:6] * ng
        if number_of_targets:
            # use all anchors
            iou = torch.stack([wh_iou(x, gwh) for x in anchor_vec], 0)
            number_of_anchors = len(anchor_vec)
            a = torch.arange(number_of_anchors).view((-1, 1)).repeat([1, number_of_targets]).view(-1)
            t = targets.repeat([number_of_anchors, 1])
            gwh = gwh.repeat([number_of_anchors, 1])
            iou = iou.view(-1)  # use all ious

            # reject anchors below iou_thres (OPTIONAL, increases P, lowers R)
            j = iou > model.hyp['iou_t']
            t, a, gwh = t[j], a[j], gwh[j]

        # Indices
        b, c = t[:, :2].long().t()  # target image, class
        gxy = t[:, 2:4] * ng  # grid x, y
        gi, gj = gxy.long().t()  # grid x, y indices
        indices.append((b, a, gj, gi))

        # GIoU
        gxy -= gxy.floor()  # xy
        tbox.append(torch.cat((gxy, gwh), 1))  # xywh (grids)
        av.append(anchor_vec[a])  # anchor vec

        # Class
        tcls.append(c)
        if c.shape[0]:  # if any targets
            assert c.max() <= number_of_classes, 'Target classes exceed model classes'

    return tcls, tbox, indices, av

I could be wrong, but it seems to me that you are not calling create_grids when you initialize the model; each layer would need self.gridsize defined.

Is create_grids defined inside each layer?
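A quick way to check (a sketch; the constructor arguments here are my assumption):

layer = YOLOLayer(anchors, number_of_classes, img_size)
print(hasattr(layer, 'gridsize'))  # False until create_grids has run on this layer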

Here is my complete YOLO model; could you please be more specific, if possible:

class YOLOLayer(nn.Module):
    def __init__(self, anchors, number_of_classes, img_size):
        """
            anchors: list of anchors to be used by this detection-layer
            number_of_classes: total number of classes that the network can detect
            img_size: (width, height) of the detection layer(input-image)
        """
        super(MyYoloModel.YOLOLayer, self).__init__()

        self.anchors = torch.FloatTensor(anchors)
        self.number_of_anchers = len(anchors) #number of anchors (3) per Yolo-detection-layer
        self.number_of_classes = number_of_classes # number of classes
        self.nx = 0  # initialize number of x gridpoints
        self.ny = 0  # initialize number of y gridpoints


    def forward(self, p, img_size):
        batch_size, ny, nx = p.shape[0], p.shape[-2], p.shape[-1] #get information about the input matrix

        #check if we need to calculate the grid
        if (self.nx, self.ny) != (nx, ny):
            NNtools.create_grids(self, img_size, (nx, ny), p.device, p.dtype)

        # p.view(batch_size, 255, 13, 13) -- > (batch_size, 3, 13, 13, 85)  # (batch_size, anchors, grid, grid, classes + xywh)
        p = p.view(batch_size, self.number_of_anchers, self.number_of_classes + 5, self.ny, self.nx).permute(0, 1, 3, 4, 2).contiguous()  # prediction

        # check if we are training or running inference on the network
        if self.training:
            return p
        else: # inference
            io = p.clone()  # inference output
            io[..., 0:2] = torch.sigmoid(io[..., 0:2]) + self.grid_xy  # xy
            io[..., 2:4] = torch.exp(io[..., 2:4]) * self.anchor_wh  # wh yolo method
            io[..., :4] *= self.stride

            torch.sigmoid_(io[..., 4:])

            if self.number_of_classes == 1:
                io[..., 5] = 1  # single-class model 

            # reshape from [1, 3, 13, 13, 85] to [1, 507, 85]
            return io.view(batch_size, -1, 5 + self.number_of_classes), p


def Get_YOLO_layers_list(self):
    """
        Returns a list with the layers of the model that are of the 'YOLOLayer'-class
    """
    detector_list = []
    for layer in self.children():
        if type(layer) is self.YOLOLayer:
            detector_list.append(layer)  # add the instance of this 'YOLOLayer' to the list

    return detector_list

Inside YOLOLayer you should define gridsize if you want to read it in:

for yololayer in yolo_layers_list:
    # get number of grid points and anchor vec for this yolo layer
    ng, anchor_vec = yololayer.gridsize, yololayer.anchor_vec
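For example, a minimal sketch based on the __init__ posted above (assuming the NNtools.create_grids from the earlier snippet and its default 13x13 grid):

class YOLOLayer(nn.Module):
    def __init__(self, anchors, number_of_classes, img_size):
        super(MyYoloModel.YOLOLayer, self).__init__()
        self.anchors = torch.FloatTensor(anchors)
        self.number_of_anchers = len(anchors)
        self.number_of_classes = number_of_classes
        self.nx = 0  # number of x gridpoints; recomputed by create_grids
        self.ny = 0  # number of y gridpoints; recomputed by create_grids
        # define the grid attributes up front, so build_targets can read
        # self.gridsize and self.anchor_vec before the first forward pass
        NNtools.create_grids(self, img_size, (13, 13))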
