Hi everyone,
I am trying to train a model on a single machine with multiple GPUs. Until now I was using nn.DataParallel, which works well, but it seems a bit slow to me, so I would like to switch to DistributedDataParallel instead.
However, I am not sure I understand clearly how to use it (I get some weird results: the training takes about 10x longer than with DataParallel).
In particular, I am not sure which GPU is supposed to load the model/batch and compute the loss function.
Moreover, with the code below, my training is slower and I noticed a weird behavior in nvidia-smi: instead of having ONE process on each GPU, I have two processes per GPU (I have two GPUs, but I see 4 processes).
My second issue is that if I increase the number of workers in the DataLoader, I get a "DataLoader pid killed" error.
Am I doing something wrong?
import os
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
import torch.multiprocessing as mp
from apex.parallel import DistributedDataParallel as DDP_apex
from torch.nn.parallel import DistributedDataParallel as DDP


def run(gpu, args):
    rank = gpu
    dist.init_process_group(
        backend='nccl',
        init_method='tcp://localhost:1088',  # 'env://'
        world_size=args.world_size,
        rank=rank
    )

    trainset = ...
    testset = ...

    ################################################################
    train_sampler = torch.utils.data.distributed.DistributedSampler(
        trainset,
        num_replicas=args.world_size,
        rank=rank
    )
    ################################################################
    test_sampler = torch.utils.data.distributed.DistributedSampler(
        testset,
        num_replicas=args.world_size,
        rank=rank
    )

    trainloader = torch.utils.data.DataLoader(trainset, batch_size=args.batch_size, shuffle=True,
                                              num_workers=args.workers, pin_memory=False, drop_last=True)
    testloader = torch.utils.data.DataLoader(testset, batch_size=args.batch_size, shuffle=False,
                                             num_workers=args.workers, pin_memory=False)

    net = 2D_CNN()                        # my model
    net = net.to(args.gpus[0])            # send the model to the first gpu
    net = DDP(net, device_ids=args.gpus)

    optim_params = list(filter(lambda p: p.requires_grad, net.parameters()))
    optimizer = optim.Adam(optim_params, lr=args.lr, betas=(0.9, 0.999), eps=1e-08,
                           weight_decay=args.weight_decay, amsgrad=True)

    # function which iterates over the dataloaders and does the forward/backward/step
    train(net, optimizer, trainloader, testloader, args, gpu)


if __name__ == "__main__":
    # args comes from my argparse setup (omitted here)
    args.nodes = 1                        # one single machine
    args.gpus = [0, 1, 2]
    #########################################################
    args.world_size = len(args.gpus) * args.nodes
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '8888'
    #########################################################
    print(args.gpus)
    print(os.environ['MASTER_PORT'])

    mp.spawn(run, nprocs=len(args.gpus), args=(args,))
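For comparison, here is the minimal one-process-per-GPU pattern I pieced together from the DDP tutorial, with a toy model and a random dataset just so it runs end to end (ToyModel, the tensor sizes and the port are placeholders I made up, not my real setup). Is this the pattern I should be following, i.e. each spawned process pins itself to a single GPU, wraps the model with device_ids=[rank], and passes its DistributedSampler to the DataLoader?

import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, TensorDataset
from torch.utils.data.distributed import DistributedSampler


class ToyModel(nn.Module):                       # placeholder for my real 2D CNN
    def __init__(self):
        super().__init__()
        self.fc = nn.Linear(32, 2)

    def forward(self, x):
        return self.fc(x)


def run(rank, world_size):
    # each spawned process handles exactly ONE gpu, indexed by its rank
    dist.init_process_group(backend="nccl", init_method="env://",
                            world_size=world_size, rank=rank)
    torch.cuda.set_device(rank)

    # random data so the sketch is self-contained
    trainset = TensorDataset(torch.randn(512, 32), torch.randint(0, 2, (512,)))
    train_sampler = DistributedSampler(trainset, num_replicas=world_size, rank=rank)
    # the sampler is given to the DataLoader and shuffle stays False
    # (the sampler does the shuffling via set_epoch below)
    trainloader = DataLoader(trainset, batch_size=64, shuffle=False,
                             sampler=train_sampler, num_workers=2, pin_memory=True)

    # model and batches live only on this process's gpu
    model = ToyModel().to(rank)
    model = DDP(model, device_ids=[rank])
    optimizer = optim.Adam(model.parameters(), lr=1e-3)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(2):
        train_sampler.set_epoch(epoch)           # reshuffle differently each epoch
        for x, y in trainloader:
            x, y = x.to(rank), y.to(rank)
            optimizer.zero_grad()
            loss = criterion(model(x), y)        # loss computed on this gpu
            loss.backward()                      # DDP all-reduces the gradients
            optimizer.step()

    dist.destroy_process_group()


if __name__ == "__main__":
    world_size = torch.cuda.device_count()
    os.environ["MASTER_ADDR"] = "localhost"
    os.environ["MASTER_PORT"] = "8888"           # arbitrary free port
    mp.spawn(run, nprocs=world_size, args=(world_size,))

If this is the right pattern, I also wonder whether the two processes per GPU I am seeing come from my device_ids=args.gpus / net.to(args.gpus[0]) usage above, and whether the worker crashes are just because each of the spawned processes starts its own num_workers DataLoader workers. Any pointers would be appreciated.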