Does DDP make computations non-deterministic?

When I train the model several times with a single process, the resulting models are always identical.
But when I train with DDP, the resulting model is different every time. This is very confusing to me!

Here is the training code:

#!/usr/bin/env python

import sys
import os
import numpy as np
import random
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms

class CNN(nn.Module):
    def __init__(self):
        super(CNN, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, 5, padding=1)
        self.conv2 = nn.Conv2d(64, 64, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv3 = nn.Conv2d(64, 128, 3, padding=1)
        self.conv4 = nn.Conv2d(128, 128, 3, padding=1)
        self.fc1 = nn.Linear(128 * 7 * 7, 1024)
        self.fc2 = nn.Linear(1024, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.pool(self.conv2(x)))
        x = F.relu(self.conv3(x))
        x = F.relu(self.pool(self.conv4(x)))
        x = x.view(-1, 128 * 7 * 7)
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

os.environ["NCCL_IB_DISABLE"] = '1'
init_method = sys.argv[1]
world_size = int(sys.argv[2])
rank = int(sys.argv[3])  # ranks are passed 1-based on the command line
torch.distributed.init_process_group(
    backend="nccl",
    init_method=init_method,
    world_size=world_size,
    rank=rank - 1)

train_data = datasets.CIFAR10(root='./data', train=True, download=True, transform=transforms.ToTensor())
train_sampler = torch.utils.data.distributed.DistributedSampler(train_data, num_replicas=world_size, rank=rank - 1)
train_loader = torch.utils.data.DataLoader(train_data, batch_size=64, pin_memory=True,
                                           shuffle=False, num_workers=0, sampler=train_sampler,
                                           worker_init_fn=lambda x: np.random.seed(1))
test_data = datasets.CIFAR10(root='./data', train=False, download=True, transform=transforms.ToTensor())
test_sampler = torch.utils.data.distributed.DistributedSampler(test_data, num_replicas=world_size, rank=rank - 1)
test_loader = torch.utils.data.DataLoader(test_data, batch_size=64, shuffle=False, num_workers=0, sampler=test_sampler)

# Seed everything so weight init and data shuffling are reproducible
torch.manual_seed(2)
torch.cuda.manual_seed_all(2)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
np.random.seed(1)
random.seed(1)

# Grab the first usable GPU on this node
for i in range(torch.cuda.device_count()):
    device = torch.device("cuda:" + str(i))
    x = torch.zeros(1)
    try:
        x = x.to(device)
        break
    except RuntimeError:
        pass

torch.cuda.set_device(device)
model = CNN()
model.to(device)
model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[device])

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
model.train()
for epoch in range(2):
    iter = 0
    for inputs, labels in train_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        iter += 1
        print("Epoch=%d iter=%d loss=%.4f" % (epoch, iter, loss.item()))

for name, para in model.named_parameters():
    text = name + ":"
    para = para.view(-1)
    for i in range(min(para.size(0), 32)):
        text += " %.10f" % para[i].item()
    print(text)

correct, tot = 0, 0
model.eval()
with torch.no_grad():
    for inputs, labels in test_loader:
        inputs, labels = inputs.to(device), labels.to(device)
        outputs = model(inputs)
        _, predicted = torch.max(outputs.data, 1)
        tot += labels.size(0)
        correct += (predicted == labels).sum().item()

print(correct / tot * 100)

Finally, I found that the randomness is caused by the NCCL rings: once the rings are fixed, training becomes deterministic.
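
For context on why the rings matter: DDP averages gradients with an NCCL all-reduce, and the ring layout determines the order in which the partial sums are added. Floating-point addition is not associative, so a different ring layout can give bitwise-different gradients even with identical seeds and data. Below is a small illustration of that, plus a sketch of how the rings might be pinned via environment variables; treat the variable names as version-dependent assumptions (NCCL_MIN_NRINGS/NCCL_MAX_NRINGS, and NCCL_RINGS on older releases), and use NCCL_DEBUG=INFO to confirm that the rings NCCL actually built are the same on every run.

# Why reduction order matters: floating-point addition is order-dependent.
a, b, c = 0.1, 0.2, 0.3
print((a + b) + c == a + (b + c))  # prints False

# Sketch of pinning the rings (assumption: an NCCL build that still honors
# these variables). Set them before torch.distributed.init_process_group().
import os
os.environ["NCCL_DEBUG"] = "INFO"    # log the rings NCCL builds at init
os.environ["NCCL_MIN_NRINGS"] = "1"  # force a fixed number of rings, aiming
os.environ["NCCL_MAX_NRINGS"] = "1"  # for a stable reduction order per run
# Older NCCL releases also accepted an explicit ring specification, e.g.:
# os.environ["NCCL_RINGS"] = "0 1 2 3"  # example only; check your NCCL docs

Note that even with the rings pinned, a bitwise match with single-process training is not expected, because DDP still sums the gradients in a different order than a single GPU would; the goal is only run-to-run determinism of the DDP job.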


How do you fix the rings?