Hi. I’ve tried to check DDP synchronization on two GPUs with the simple model - and failed.
I used sniplet from the tutorial, with just a small change: I used random inputs, but labels were not random, I used local rank as a labels.
To check model behavior, I printed model output on each step, supposing that:
- if synchronization works, they should be similar (around 0.5 for two GPUs)
- without synchronization, I’ll get values around 0 for cuda:0 and near 1 for cuda:1
It appeared, the second case is true:
on device 0, step 0 output is 0.05
on device 1, step 0 output is 0.13
on device 0, step 100 output is -0.12
on device 1, step 100 output is 0.48
on device 0, step 200 output is 0.08
on device 1, step 200 output is 0.52
on device 0, step 300 output is 0.10
on device 1, step 300 output is 0.86
on device 0, step 400 output is 0.13
on device 1, step 400 output is 1.15
Why???
import os
import torch
import torch.nn as nn
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.optim as optim
input_size = 10
output_size = 1
class Model(nn.Module):
def __init__(self):
super(Model, self).__init__()
self.fc = nn.Linear(input_size, input_size)
self.relu = nn.ReLU()
self.fc1 = nn.Linear(input_size, output_size)
def forward(self, input):
_x0 = self.fc(input)
_x1 = self.relu(_x0)
output = self.fc1(_x1)
return output
def worker(rank):
dist.init_process_group("nccl", rank=rank, world_size=2)
torch.cuda.set_device(rank)
print("init model")
model = Model().cuda()
print("init ddp on rank", rank)
ddp_model = torch.nn.parallel.DistributedDataParallel(model, device_ids=[rank])
optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()
inp = torch.randn(10, 10).cuda()
print("train on rank", rank)
for _i in range(4000):
output = ddp_model(inp)
label = (torch.ones(output_size) * rank).to(rank)
loss = loss_fn(output, label)
l1 = loss.detach().cpu().numpy()
loss.backward()
# for param in model.parameters():
# dist.all_reduce(param.grad.data,
# op=dist.reduce_op.SUM,
# async_op = False)
# param.grad.data /= 2
optimizer.step()
if ( _i % 100 == 0):
print("on device %d, step %4d output is %5.2f" % (rank, _i, output.detach().mean().cpu().numpy()))
if __name__ == "__main__":
os.environ["MASTER_ADDR"] = "localhost"
os.environ["MASTER_PORT"] = "29501"
n_gpus = torch.cuda.device_count()
mp.spawn(worker, nprocs=n_gpus, args=())