Using multiple GPUs (PyTorch DDP) and showing loss graphs with dist.reduce

Hi all, I am using PyTorch DDP to fine-tune my model. I am wondering how I can save the average of the loss across all GPUs so I can plot the loss graph. When I print the loss in the code, it shows me three losses from the 3 GPUs, which makes sense, but for the graph I need a single reduced loss. Is the following code the correct way to do that, and is the definition of "avg_train_loss_reduced" correct to use as the final loss for the graph? Many thanks for your feedback.

    import copy
    import torch
    import torch.distributed as dist
    from torch.nn.parallel import DistributedDataParallel as DDP

    model = copy.deepcopy(model_or)
    model = model.to(gpu_id)
    model = DDP(model, device_ids=[gpu_id])
    print("gpu_id", gpu_id)
    # ========================================
    #               Training
    # ========================================
          
    for epoch_i in range(0, total_epochs):
        
        print("")
        print('======== Epoch {:} / {:} ========'.format(epoch_i + 1, total_epochs))
        print('Training...')

        ##########################################
        # Tell the DistributedSampler which epoch this is so it reshuffles differently each epoch.
        train_loader.sampler.set_epoch(epoch_i)
        b_sz = len(next(iter(train_loader))[0])
        print(f"[GPU{gpu_id}] Epoch {epoch_i} | Batchsize: {b_sz} | Steps: {len(train_loader)}")
        ##########################################
        total_train_loss = 0

        model.train()

        for step, batch in enumerate(train_loader):
            #################################
            b_input_ids = batch[0].to(gpu_id, non_blocking=True)
            # Labels are the input ids themselves (causal-LM style fine-tuning).
            b_labels = batch[0].to(gpu_id, non_blocking=True)
            b_masks = batch[1].to(gpu_id, non_blocking=True)
            #################################

            optimizer.zero_grad()        

            outputs = model(b_input_ids,
                            labels=b_labels,
                            attention_mask=b_masks,
                            token_type_ids=None)

            loss = outputs[0]  
            batch_loss = loss.item()
            total_train_loss += batch_loss
            loss.backward()
            optimizer.step()
            scheduler.step()
        
        # Calculate the average loss over all of the batches on this GPU.
        avg_train_loss = total_train_loss / len(train_loader)

        ## Reduce the per-GPU averages and sum them. dist.reduce/all_reduce
        ## operates on tensors, not Python floats, so wrap the value in a
        ## tensor on this device first.
        avg_train_loss_t = torch.tensor(avg_train_loss, device=gpu_id)
        dist.all_reduce(avg_train_loss_t, op=dist.ReduceOp.SUM)

        ### Divide by the number of processes (3 GPUs here) instead of hard-coding 3.
        avg_train_loss_reduced = avg_train_loss_t / dist.get_world_size()

        # Only one process needs to write the value. torch.save writes a pickled
        # tensor (not a CSV), so a .pt extension is used; the epoch is included
        # in the name so each epoch's value is kept instead of being overwritten.
        if dist.get_rank() == 0:
            Path_3 = pt_save_directory + '/' + 'avg_train_loss_reduced_epoch' + str(epoch_i) + ".pt"
            torch.save(avg_train_loss_reduced, Path_3)
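
For the graph itself, this is roughly what I have in mind for the saved values (just a sketch, assuming rank 0 appends `avg_train_loss_reduced.item()` to a list each epoch and that matplotlib is available; `epoch_losses` and the file names are placeholder names I made up):

    import csv
    import matplotlib.pyplot as plt

    # Collected once per epoch on rank 0, e.g.
    #     epoch_losses.append(avg_train_loss_reduced.item())
    epoch_losses = [2.31, 1.87, 1.52]  # example values for illustration only

    # Write a real CSV (one row per epoch) instead of a pickled tensor.
    with open("avg_train_loss_reduced.csv", "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["epoch", "avg_train_loss"])
        for i, loss_val in enumerate(epoch_losses, start=1):
            writer.writerow([i, loss_val])

    # Plot the training curve.
    plt.plot(range(1, len(epoch_losses) + 1), epoch_losses, marker="o")
    plt.xlabel("epoch")
    plt.ylabel("average training loss (all GPUs)")
    plt.savefig("train_loss.png")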