Output different with and without FSDP

Hi i tried running a small model with and without FSDP to check if i get the same output when identical input is provided. Below is my code for the same , basically the assert fails when running the code.

import os
import torch
import torch.nn as nn
import torch.nn.functional as F

import torch.distributed as dist
import torch.multiprocessing as mp
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def setup(rank, world_size):
os.environ[‘MASTER_ADDR’] = ‘localhost’
os.environ[‘MASTER_PORT’] = ‘12355’

# initialize the process group
dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():

class Net(nn.Module):
def init(self):
super(Net, self).init()
self.conv1 = nn.Conv2d(1, 32, 3, 1)
self.conv2 = nn.Conv2d(32, 64, 3, 1)
self.dropout1 = nn.Dropout(0.25)
self.dropout2 = nn.Dropout(0.5)
self.fc1 = nn.Linear(9216, 128)
self.fc2 = nn.Linear(128, 10)

def forward(self, x):

    x = self.conv1(x)
    x = F.relu(x)
    x = self.conv2(x)
    x = F.relu(x)
    x = F.max_pool2d(x, 2)
    x = self.dropout1(x)
    x = torch.flatten(x, 1)
    x = self.fc1(x)
    x = F.relu(x)
    x = self.dropout2(x)
    x = self.fc2(x)
    output = F.log_softmax(x, dim=1)
    return output

def fsdp_main(rank, world_size):

setup(rank, world_size)

model = Net()
inp = torch.rand(4,1,28,28)
out_og = model(inp)

fsdp_model = FSDP(model, device_id=rank)
inp_cuda = inp.to(rank)
out = fsdp_model(inp_cuda)

assert torch.max(torch.abs(out_og - out.cpu())) < 1e-6, f"parallelized and original model have different outputs max diff {torch.max(torch.abs(out_og - out.cpu()))}"

print("ALL Done")

if name == ‘main’:
WORLD_SIZE = torch.cuda.device_count()

I was not expecting this behavior and expected the outputs to be same. I don’t understand why the output is different here?

You have to guarantee the initialization of model parameters and inputs to be the same, e.g. using the same random seeds on all ranks, to achieve identical outcome.