I am sharing two code listings to reproduce the issue. The first is a DataParallel (DP) program, provided for comparison. The second uses DistributedDataParallel (DDP), and its forward and backward passes take longer than with DP.
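Both scripts run on a single node and are launched directly with Python; the DDP version spawns one process per GPU itself via mp.spawn, so no external launcher is needed. A rough launch example is below (the file names train_dp.py and train_ddp.py are only placeholders for the two listings):

python train_dp.py --net res34 --batch_size 4096 --optimizer Adam
python train_ddp.py --net res34 --batch_size 4096 --optimizer Adam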
DP
"""  Training Resnet34 for Cifar10 by Data Parallel """
from __future__ import print_function
import torch
import torch.nn as nn
import torch.optim as optim
import torch.backends.cudnn as cudnn
import torchvision
import torchvision.transforms as transforms
import sys
import time
import argparse
from models import *
from sync_batchnorm import convert_model, DataParallelWithCallback
def main() :
    
    parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training')
    parser.add_argument('--net', default='res34')
    parser.add_argument('--batch_size', default=4096, type=int)
    parser.add_argument('--optimizer', default="Adam")
    parser.add_argument('--epochs', default=2, type=int)
    parser.add_argument('--n_nodes', default=1, type=int)
    parser.add_argument('--nr', default=0, type=int)
    args = parser.parse_args()
    if torch.cuda.is_available() :
        args.n_gpus = torch.cuda.device_count()
        print(args.n_gpus, " GPU(s) available")
        print(torch.cuda.get_device_name(0))
        
    else :
        print("GPU is NOT available.")   
        sys.exit()
        
    print("Total batch size = ", args.batch_size)    
    print("Batch size = ", int(args.batch_size / args.n_gpus), "/ GPU")    
    print("Optimizer = ", args.optimizer)
    
    train(args)
    print()
       
# Training
def train(args):
    
    epochs = args.epochs
    batch_size = args.batch_size    # total batch_size.
    n_gpus = args.n_gpus
    
    worker = 8
      
    if args.net=='res18':
        net = ResNet18()
    elif args.net=='res34':
        net = ResNet34()
    elif args.net=='res50':
        net = ResNet50()
    elif args.net=='res101':
        net = ResNet101()
    
    print("Model = ", net.__class__.__name__)
    print()
    
    d_list = list(range(n_gpus))        
    net = convert_model(net).cuda() # Convert BatchNorm into SyncBatchNorm
    net = DataParallelWithCallback(net, device_ids = d_list) # Data Parallel
      
    cudnn.benchmark = True  
    
    criterion = nn.CrossEntropyLoss()
    
    if args.optimizer == "Adam" :
        optimizer = optim.Adam(net.parameters())
        
    elif args.optimizer == "SGD" :
        optimizer = optim.SGD(net.parameters(), lr = 0.1)
       
    transform_list = [
                  transforms.RandomChoice([
                  transforms.RandomCrop(32, padding=4),
                  transforms.RandomResizedCrop(32, scale=(0.7, 1.0), ratio = (1.0, 1.0)),
                  ]),
                  transforms.RandomHorizontalFlip(),
                  transforms.RandomRotation(degrees = 20), 
                  transforms.ToTensor(),
                  transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
                  ]
                  
    transform_train = transforms.Compose(transform_list)
    trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
    trainloader = torch.utils.data.DataLoader(trainset, batch_size=batch_size, shuffle=True, num_workers=worker)
    for epoch in range(epochs):
        
        print()
        print("epoch : ",epoch + 1, " / ", epochs)
        net.train()        
        
        """   ------- Training loop  -------- """
   
        for batch_idx, (inputs, targets) in enumerate(trainloader):
          
            inputs, targets = inputs.to('cuda'), targets.to('cuda')    
            
            message = ""
            t0 = time.time() 
            
            optimizer.zero_grad()      
            
            t1 = time.time() 
            message += "  zero grad: {0:.5f}".format(t1 - t0)
            
            outputs = net(inputs)
            
            t2 = time.time() 
            message += "  out: {0:.5f}".format(t2 - t1)
            
            loss = criterion(outputs, targets)
            
            t3 = time.time() 
            message += "  loss: {0:.5f}".format(t3 - t2)
            
            loss.backward()
            
            t4 = time.time() 
            message += "  back: {0:.5f}".format(t4 - t3)
            
            optimizer.step()
            loss_val = loss.item()    # read the scalar loss (synchronizes with the GPU)
            t5 = time.time() 
            message += "  step: {0:.5f}".format(t5 - t4)
                 
            print("{0:.6f}".format(loss_val) + message)                    
if __name__ == '__main__':
    main()
    
DDP
"""  Training Resnet34 for Cifar10 by Distributed Data Parallel """
from __future__ import print_function
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torch.backends.cudnn as cudnn
import torchvision
import torchvision.transforms as transforms
import sys
import os
import time
import argparse
from models import *
from sync_batchnorm import convert_model, DataParallelWithCallback
def main() :
    
    parser = argparse.ArgumentParser(description='PyTorch CIFAR10 Training')
    parser.add_argument('--net', default='res34')
    parser.add_argument('--batch_size', default=4096, type=int)
    parser.add_argument('--optimizer', default="Adam")
    parser.add_argument('--epochs', default=1, type=int)
    parser.add_argument('--n_nodes', default=1, type=int)
    parser.add_argument('--nr', default=0, type=int)
    args = parser.parse_args()
    if torch.cuda.is_available() :
        args.n_gpus = torch.cuda.device_count()
        print(args.n_gpus, " GPU(s) available")
        print(torch.cuda.get_device_name(0))
        
        # for DDP
        args.world_size = args.n_gpus * args.n_nodes
        os.environ['MASTER_ADDR'] = 'localhost'
        os.environ['MASTER_PORT'] = '8888' 
    
    else :
        print("GPU is NOT available.")   
        sys.exit()
        
    print("Total batch size = ", args.batch_size)
    
    args.batch_size = int(args.batch_size / args.world_size) # for DDP
    print("Batch size = ", args.batch_size, "/ GPU")
    
    print("Optimizer = ", args.optimizer)
    
    """ Distributed Data Parallel (DDP)"""
    mp.spawn(train, nprocs=args.n_gpus, args=(args,)) 
    print()
       
# Training
def train(gpu, args):
    
    rank = args.nr * args.n_gpus + gpu
    dist.init_process_group(
        backend='nccl',
        init_method='env://',
        world_size=args.world_size,
        rank=rank
    )
    epochs = args.epochs
    batch_size = args.batch_size    # batch_size is per GPU size.
  
    torch.manual_seed(0)
    
    if args.net=='res18':
        net = ResNet18()
    elif args.net=='res34':
        net = ResNet34()
    elif args.net=='res50':
        net = ResNet50()
    elif args.net=='res101':
        net = ResNet101()
    
    if rank == 0 :         
        print("Model = ", net.__class__.__name__)
        print()
    
    torch.cuda.set_device(gpu)    
    
    net = torch.nn.SyncBatchNorm.convert_sync_batchnorm(net)
    net = net.cuda(gpu)
    
    criterion = nn.CrossEntropyLoss().cuda(gpu)
    if args.optimizer == "Adam" :
        optimizer = optim.Adam(net.parameters())
        
    elif args.optimizer == "SGD" :
        optimizer = optim.SGD(net.parameters(), lr = 0.1)
 
    net = nn.parallel.DistributedDataParallel(net, device_ids=[gpu])
    transform_list = [
                  transforms.RandomChoice([
                  transforms.RandomCrop(32, padding=4),
                  transforms.RandomResizedCrop(32, scale=(0.7, 1.0), ratio = (1.0, 1.0)),
                  ]),
                  transforms.RandomHorizontalFlip(),
                  transforms.RandomRotation(degrees = 20), 
                  transforms.ToTensor(),
                  transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
                  ]
                  
    transform_train = transforms.Compose(transform_list)
     
    trainset = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
    train_sampler = torch.utils.data.distributed.DistributedSampler(
    	trainset,
    	num_replicas = args.world_size,
    	rank = rank
    )
    trainloader = torch.utils.data.DataLoader(trainset, batch_size = batch_size, 
                                              shuffle=False, num_workers=0,
                                              pin_memory = False, sampler=train_sampler)
    for epoch in range(epochs):
        
        if rank == 0 :
            print()
            print("epoch : ",epoch + 1, " / ", epochs)
        net.train()        
        
        """   ------- Training loop  -------- """
   
        for batch_idx, (inputs, targets) in enumerate(trainloader):
                      
            inputs = inputs.cuda(non_blocking=True)
            targets = targets.cuda(non_blocking=True)
            
            message = ""
            t0 = time.time() 
            
            optimizer.zero_grad()      
            
            t1 = time.time() 
            message += "  zero grad: {0:.5f}".format(t1 - t0)
            
            outputs = net(inputs)
            
            t2 = time.time() 
            message += "  out: {0:.5f}".format(t2 - t1)
            
            loss = criterion(outputs, targets)
            
            t3 = time.time() 
            message += "  loss: {0:.5f}".format(t3 - t2)
            
            loss.backward()
            
            t4 = time.time() 
            message += "  back: {0:.5f}".format(t4 - t3)
            
            optimizer.step()
            loss_val = loss.item()    # read the scalar loss (synchronizes with the GPU)
            t5 = time.time() 
            message += "  step: {0:.5f}".format(t5 - t4)
                 
            if rank == 0 :
                print("{0:.6f}".format(loss_val) + message)                    
    dist.destroy_process_group()  # tear down the process group once, after all epochs are done
if __name__ == '__main__':
    main()
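A note on how the per-step times above are taken: both loops read time.time() around CUDA calls without calling torch.cuda.synchronize() first. Below is a minimal sketch of a synchronized timer that could be dropped into either loop if needed; sync_time is my own helper name and is not part of the scripts above.

import time
import torch

def sync_time():
    # Wait for all queued CUDA kernels on the current device to finish,
    # so that the next measured interval reflects completed GPU work
    # rather than just kernel-launch time.
    torch.cuda.synchronize()
    return time.time()

Replacing the time.time() calls with sync_time() in the training loops would attribute each interval to the operation that immediately precedes it.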
    
Please let me know if something is wrong. Thank you.