Hi, I tried running a small model with and without FSDP to check whether I get the same output when an identical input is provided. Below is my code; the assert fails when I run it.
```python
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP

def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # initialize the process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        self.fc2 = nn.Linear(128, 10)

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = F.relu(self.conv2(x))
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = F.relu(self.fc1(x))
        x = self.dropout2(x)
        x = self.fc2(x)
        return F.log_softmax(x, dim=1)

def fsdp_main(rank, world_size):
    setup(rank, world_size)
    model = Net()
    model.eval()
    inp = torch.rand(4, 1, 28, 28)
    out_og = model(inp)          # reference output on CPU, before FSDP wrapping
    fsdp_model = FSDP(model, device_id=rank)
    fsdp_model.eval()
    inp_cuda = inp.to(rank)
    out = fsdp_model(inp_cuda)   # output of the FSDP-wrapped model on GPU
    max_diff = torch.max(torch.abs(out_og - out.cpu()))
    assert max_diff < 1e-6, f"parallelized and original model have different outputs, max diff {max_diff}"
    print("ALL Done")
    cleanup()

if __name__ == '__main__':
    WORLD_SIZE = torch.cuda.device_count()
    mp.spawn(fsdp_main,
             args=(WORLD_SIZE,),
             nprocs=WORLD_SIZE,
             join=True)
```
I was not expecting this behavior; I expected the two outputs to be the same. Why are the outputs different here?
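For what it's worth, one thing I have not ruled out is whether every spawned rank even builds `Net()` with the same weights, since nothing seeds the RNG before the model is constructed. Below is a minimal sketch of the check I have in mind; the fixed seed and the `sync_module_states=True` flag are assumptions I want to test, not something the script above does:

```python
# Sketch only -- reuses Net, setup, cleanup, and FSDP from the script above.
# torch.manual_seed(0) and sync_module_states=True are assumptions to test,
# not part of the original script.
def fsdp_main_seeded(rank, world_size):
    setup(rank, world_size)
    torch.manual_seed(0)              # same RNG state on every rank, so Net() weights match
    model = Net()
    model.eval()
    inp = torch.rand(4, 1, 28, 28)    # also deterministic now, so every rank sees the same input
    out_og = model(inp)
    # sync_module_states=True asks FSDP to broadcast rank 0's parameters and buffers before sharding
    fsdp_model = FSDP(model, device_id=rank, sync_module_states=True)
    fsdp_model.eval()
    out = fsdp_model(inp.to(rank))
    print(rank, torch.max(torch.abs(out_og - out.cpu())))
    cleanup()
```

Even with that, I am not sure how tight a tolerance is reasonable for the assert, since the reference forward runs on CPU while the FSDP forward runs on GPU, so small floating-point differences might be expected regardless.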