I have a model that reliably trains to a given performance without DDP at a batch size of 2n. When I enable DDP, call SyncBatchNorm.convert_sync_batchnorm, use a DistributedSampler, change the per-GPU batch size to n, and train on two GPUs, I get significantly worse results, and I wonder why. That led me to check whether SyncBatchNorm gives the same results as BatchNorm on identical data, and I find that it doesn't, at least not bit for bit. What should I make of this?
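For reference, the conversion I'm describing follows the usual recipe; here is a minimal sketch, where wrap_for_ddp is just a hypothetical helper and model/dataset stand in for my actual ones (batch size n per process, so 2n globally across two GPUs):

import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

def wrap_for_ddp(model, dataset, rank, world_size, n):
    # Replace every BatchNorm layer so statistics are synchronized
    # across processes instead of computed per GPU.
    model = nn.SyncBatchNorm.convert_sync_batchnorm(model).to(rank)
    ddp_model = DDP(model, device_ids=[rank])
    # DistributedSampler gives each rank a disjoint shard of the data;
    # batch_size is per process: n per GPU, 2n across two GPUs.
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    loader = DataLoader(dataset, batch_size=n, sampler=sampler)
    return ddp_model, loader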
PyTorch 1.11 built from source, CUDA 11, RTX 3090s.
Script:
import os

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
from torch.nn.parallel import DistributedDataParallel as DDP


def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # Initialize the process group.
    dist.init_process_group("gloo", rank=rank, world_size=world_size)


def cleanup():
    dist.destroy_process_group()


def demo_basic(rank, world_size):
    setup(rank, world_size)
    # A single SyncBatchNorm layer over 2 channels, wrapped in DDP.
    model = nn.SyncBatchNorm(2).to(rank)
    ddp_model = DDP(model, device_ids=[rank])
    # Each rank takes its half of the batch: shape (1, 2, 4).
    values = torch.load('values.pt')
    values = values.view(2, 1, 2, 4)
    values = values[rank].to(rank)
    out = ddp_model(values)
    torch.save(out.detach(), f'out{rank}.pt')
    cleanup()


def run_ddp(demo_fn, world_size):
    mp.spawn(demo_fn,
             args=(world_size,),
             nprocs=world_size,
             join=True)


if __name__ == "__main__":
    # Four length-4 samples from normals with increasing mean and std.
    means = [1, 2, 3, 4]
    stds = [1, 2, 3, 4]
    values = [
        torch.normal(m, s, size=(1, 1, 4))
        for m, s in zip(means, stds)
    ]
    values = torch.cat(values, dim=1)  # (1, 4, 4)
    values = values.view(2, 2, 4)      # batch of 2, 2 channels, length 4

    # Reference: plain BatchNorm1d over the whole batch on one GPU.
    model = nn.BatchNorm1d(2).cuda()
    out = model(values.cuda())

    # DDP: the same batch split across two ranks with SyncBatchNorm.
    torch.save(values, 'values.pt')
    run_ddp(demo_basic, 2)

    # Collect the per-rank outputs and compare against the reference.
    out = out.detach().cpu()
    out_rank0 = torch.load('out0.pt', map_location={'cuda:0': 'cpu'})
    out_rank1 = torch.load('out1.pt', map_location={'cuda:1': 'cpu'})
    out_ddp = torch.cat((out_rank0, out_rank1), dim=0)
    print(out)
    print(out_ddp)
    diff = out - out_ddp
    print(diff)
Result:
tensor([[[-0.5313, -0.8621, -0.7267, -0.8753],
[ 0.1433, -1.0897, 1.1616, 1.2525]],
[[ 1.0406, 1.7617, 0.9555, -0.7623],
[-1.6836, -0.1252, -0.5245, 0.8654]]])
tensor([[[-0.5313, -0.8621, -0.7267, -0.8753],
[ 0.1433, -1.0897, 1.1616, 1.2525]],
[[ 1.0406, 1.7617, 0.9555, -0.7623],
[-1.6836, -0.1252, -0.5245, 0.8654]]])
tensor([[[-5.9605e-08, -5.9605e-08, -5.9605e-08, -5.9605e-08],
[ 7.4506e-08, -1.1921e-07, 1.1921e-07, 2.3842e-07]],
[[-1.1921e-07, 1.1921e-07, -1.1921e-07, -5.9605e-08],
[-1.1921e-07, 2.9802e-08, 0.0000e+00, 1.7881e-07]]])
Notice the repeated values in the difference? Is some numerical imprecision happening here? I ask as someone who was recently burned by TensorFloat32 silently doing reduced-precision matrix multiplies.
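For what it's worth, the repeated magnitudes look like plain float32 rounding: 5.9605e-08 is 2**-24 and 1.1921e-07 is 2**-23, i.e. right at the float32 machine-epsilon scale, so every entry in the difference sits at the rounding level for outputs of this size. A sketch of the checks I have in mind, to append to the end of the script above (the TF32 flags are the standard backend switches and are set per process, so each spawned rank would need them too; as far as I know TF32 only affects matmuls and convolutions, not batch-norm reductions, but setting them rules it out):

# The repeated differences are exactly at the float32 rounding level.
print(2 ** -24)  # 5.960464477539063e-08, matches the 5.9605e-08 entries
print(2 ** -23)  # 1.1920928955078125e-07, matches the 1.1921e-07 entries

# The two outputs agree within ordinary float32 tolerance.
print(torch.allclose(out, out_ddp))  # expect True for the numbers above

# Disable TF32 to rule it out, then re-run the comparison with these
# set in every process.
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False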