Here is my code:
```python
import os

import torch
import torch.distributed as dist  # public API, instead of the private distributed_c10d module
from torch.utils.data import BatchSampler, DataLoader
from torch.utils.data.distributed import DistributedSampler

if opt.dist_gpu:
    # A torchrun-style launcher sets these per process.
    rank = int(os.getenv('RANK', 0))
    world_size = int(os.getenv('WORLD_SIZE', 1))
    local_rank = int(os.getenv('LOCAL_RANK', 0))
    print('rank is %d, world_size is %d' % (rank, world_size))
    torch.cuda.set_device(local_rank)
    dist.init_process_group(
        backend="nccl",
        init_method=opt.dist_url,
        world_size=world_size,
        rank=rank,
    )
    dist.barrier()
    opt.gpu_ids = range(world_size)
    print("opt.gpu_ids:", opt.gpu_ids)

device = torch.device(f'cuda:{local_rank}' if torch.cuda.is_available() else 'cpu')
```
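For context, `RANK`, `WORLD_SIZE`, and `LOCAL_RANK` are the environment variables a `torchrun`-style launcher sets for each process. A quick per-rank sanity check right after initialization looks like this (a minimal sketch, assuming the process group above has been created):

```python
# Sketch: each process should report a unique rank, the same world size,
# and a CUDA device index matching its LOCAL_RANK.
if opt.dist_gpu:
    print(f"[rank {dist.get_rank()}/{dist.get_world_size()}] "
          f"cuda device = {torch.cuda.current_device()}")
```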
```python
if opt.dist_gpu:
    # Each rank draws a disjoint shard of the dataset.
    train_sampler = DistributedSampler(dataset=opencood_train_dataset, shuffle=True)
    train_batch_sampler = BatchSampler(
        train_sampler, hypes['train_params']['batch_size'], drop_last=True)
    train_loader = DataLoader(opencood_train_dataset,
                              num_workers=4,
                              collate_fn=opencood_train_dataset.collate_batch_train,
                              pin_memory=True,
                              prefetch_factor=2,
                              batch_sampler=train_batch_sampler)
    val_sampler = DistributedSampler(dataset=opencood_validate_dataset, shuffle=False)
    val_loader = DataLoader(opencood_validate_dataset,
                            batch_size=hypes['train_params']['batch_size'],
                            num_workers=4,
                            collate_fn=opencood_train_dataset.collate_batch_train,
                            pin_memory=True,
                            drop_last=False,
                            prefetch_factor=2,
                            sampler=val_sampler)
```
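One detail that matters with `DistributedSampler`: it shuffles based on an internal epoch counter, so `set_epoch` has to be called at the start of every epoch, otherwise every epoch sees the identical ordering on every rank. A minimal sketch of the epoch loop (`num_epochs` and the loop body are placeholders, not my actual training code):

```python
for epoch in range(num_epochs):  # num_epochs is a placeholder
    if opt.dist_gpu:
        # Without this, DistributedSampler reuses the same shuffle every epoch.
        train_sampler.set_epoch(epoch)
    for batch in train_loader:
        ...  # forward / backward / optimizer step
```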
```python
if opt.dist_gpu:
    print('distributed training')
    model = model.cuda(local_rank)
    model = torch.nn.parallel.DistributedDataParallel(
        model, device_ids=[local_rank], find_unused_parameters=True)
```
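Two things I know behave differently under DDP compared to single-GPU training: the effective global batch size becomes `batch_size * world_size`, so the learning rate may need rescaling, and BatchNorm layers compute statistics per GPU unless they are converted to SyncBatchNorm. A hedged sketch of both (`base_lr` is a placeholder for the single-GPU learning rate; the conversion has to happen before the DDP wrap):

```python
# Sketch: convert BatchNorm to SyncBatchNorm *before* wrapping in DDP, so
# normalization statistics are computed across all GPUs rather than per GPU.
model = torch.nn.SyncBatchNorm.convert_sync_batchnorm(model)
model = torch.nn.parallel.DistributedDataParallel(
    model, device_ids=[local_rank], find_unused_parameters=True)

# Sketch: linear learning-rate scaling for the larger effective batch.
lr = base_lr * world_size
```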
The strange part: when I train on a single GPU, the performance is normal, but with DDP training the model's results are much worse. I don't know why.
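In case it's relevant: under DDP each rank only evaluates its own shard of the validation set, so per-rank metrics are not directly comparable to the single-GPU numbers unless they are reduced across processes. A minimal sketch (assuming `local_val_loss` is a Python float computed on this rank):

```python
# Sketch: average a per-rank scalar metric over all processes, so every
# rank ends up reporting the value for the full validation set.
val_loss = torch.tensor(local_val_loss, device=device)
dist.all_reduce(val_loss, op=dist.ReduceOp.SUM)
val_loss = (val_loss / world_size).item()
```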