Hi, correct me if I'm wrong, but I've found that Dropout behaves identically (i.e. its masks are correlated) across different GPUs when using DDP: elements at the same positions in the output tensors on different GPUs are all dropped or all kept together.
I believe this might make the training loss decrease more slowly than with single-GPU training.
Is this a bug, and is there a quick fix to make the dropout layers operate independently across GPUs? (I've sketched one candidate workaround after the results below.)
Example code to reproduce:
import torch
import torch.nn as nn
import numpy as np
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.distributed import DistributedSampler
from torch.nn.parallel import DistributedDataParallel as DDP
import random
import torch.distributed as dist
import os

def set_seed(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

class CustomDataset(Dataset):
    def __init__(self, size=10) -> None:
        super().__init__()
        self.size = size

    def __getitem__(self, i):
        return torch.Tensor([i]), torch.Tensor([1, i, 2])

    def __len__(self):
        return self.size

class CustomModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear1 = nn.Linear(1, 3)
        self.dropout = nn.Dropout(p=0.5)

    def forward(self, x):
        return self.dropout(self.linear1(x))

def main():
    local_process_index = int(os.environ.get("LOCAL_RANK", -1))
    dist.init_process_group(backend="nccl",
                            world_size=2,  # use 2 GPUs
                            rank=local_process_index)
    set_seed(66)  # same seed on every rank
    device = torch.device("cuda", local_process_index)
    dataset = CustomDataset(size=10)
    dataloader = DataLoader(dataset, batch_size=5, shuffle=False,
                            sampler=DistributedSampler(dataset))
    model = DDP(CustomModel().to(device),
                device_ids=[local_process_index],
                output_device=local_process_index)
    model.train()  # dropout is only active in training mode
    for input_, output_ in dataloader:
        input_, output_ = input_.to(device), output_.to(device)
        res = model(input_)
        print(f'{res} : {input_}')
    dist.destroy_process_group()

if __name__ == '__main__':
    main()
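
For reference, I launch this on a single node with 2 GPUs via torchrun, which sets the LOCAL_RANK environment variable the script reads (the file name here is just a placeholder):

torchrun --nproc_per_node=2 repro.py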
-------
Result:
tensor([[ 0.0000, -0.5181, -0.2358],
        [ 0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000, -8.9689],
        [ 0.2071,  0.0000,  0.0000],
        [ 0.0000, -2.4261,  0.0000]], device='cuda:1',
       grad_fn=<FusedDropoutBackward>)
tensor([[  0.0000,  -6.2421,  -3.5107],
        [  0.0000,   0.0000,   0.0000],
        [  0.0000,   0.0000,  -2.4191],
        [  0.2832,   0.0000,   0.0000],
        [  0.0000, -10.0581,   0.0000]], device='cuda:0',
       grad_fn=<FusedDropoutBackward>)
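
Note that the zero positions match exactly between cuda:0 and cuda:1, i.e. both ranks drew the same dropout mask. The only quick fix I can think of (untested on my side, so treat it as a sketch) is to offset the seed by the process rank after init_process_group, so each GPU gets its own RNG stream; since DDP broadcasts the parameters from rank 0 at construction, differing seeds shouldn't desynchronize the weights:

def set_seed_per_rank(base_seed, rank):
    # Hypothetical helper: seed every RNG with base_seed + rank so the
    # CUDA RNG state (and hence the dropout mask) differs on each GPU.
    seed = base_seed + rank
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# in main(), after dist.init_process_group(...):
#     set_seed_per_rank(66, dist.get_rank())

Is this the recommended approach, or does DDP have a built-in way to decorrelate dropout across ranks?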