### 🐛 Describe the bug
Hi all,
I am stuck on a weird behavior and want to verify whether this is an FSDP bug. Basically, I want to use FSDP to train an MoE model, and training hangs after several steps without any error message. I have made a minimal reproducible example based on the official [FSDP tutorial](https://pytorch.org/tutorials/intermediate/FSDP_tutorial.html). Could you help me out? Thanks!
```python
# Based on: https://github.com/pytorch/examples/blob/master/mnist/main.py
import os
import argparse
import functools
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import StepLR
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data.distributed import DistributedSampler
from torch.distributed.fsdp import FullyShardedDataParallel as FSDP
from torch.distributed.fsdp.fully_sharded_data_parallel import (
    CPUOffload,
    BackwardPrefetch,
)
from torch.distributed.fsdp.wrap import (
    _module_wrap_policy,
    size_based_auto_wrap_policy,
    enable_wrap,
    wrap,
)

def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12361'
    # initialize the process group
    dist.init_process_group("nccl", rank=rank, world_size=world_size)

def cleanup():
    dist.destroy_process_group()

class Expert(nn.Module):
    def __init__(self, in_dim, out_dim):
        super().__init__()
        self.expert = nn.Linear(in_dim, out_dim)

    def forward(self, x):
        return self.expert(x)

class Net(nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 32, 3, 1)
        self.conv2 = nn.Conv2d(32, 64, 3, 1)
        self.dropout1 = nn.Dropout(0.25)
        self.dropout2 = nn.Dropout(0.5)
        self.fc1 = nn.Linear(9216, 128)
        ## MoE
        self.num_experts = 5
        self.topk_experts = 2
        self.moe_out_dim = 64
        self.experts = nn.ModuleList([Expert(128, self.moe_out_dim) for _ in range(self.num_experts)])
        self.projectors = nn.ModuleList([nn.Linear(self.moe_out_dim, self.moe_out_dim) for _ in range(self.num_experts)])
        self.gate = nn.Linear(128, self.num_experts)
        self.fc2 = nn.Linear(64, 10)
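
    # Token-choice top-k routing (same scheme as the Mixtral reference
    # implementation): softmax over the gate logits, keep the top-2 experts
    # per sample, renormalize their weights, and dispatch each sample only
    # to its selected experts.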
    def moe(self, x):
        batch_size, d_model = x.shape
        router_logits = self.gate(x)  # [batch_size, num_experts]
        _routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
        routing_weights, selected_experts = torch.topk(_routing_weights, self.topk_experts, dim=-1)
        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
        routing_weights = routing_weights.to(x.dtype)  # [batch_size, top_k]
        # expert_mask[e, k, b] == 1 iff expert e is sample b's k-th choice
        expert_mask = F.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
        output = torch.zeros(batch_size, self.moe_out_dim, device=x.device)
        for idx in range(self.num_experts):
            top_idx, batch_idx = torch.where(expert_mask[idx])
            projector = self.projectors[idx]
            expert = self.experts[idx]
            # NOTE: an expert's forward only runs if that expert was selected
            # by at least one sample on this rank.
            if len(top_idx) > 0:
                selected_x = x[batch_idx]
                selected_x = expert(selected_x)
                selected_x = projector(selected_x) * routing_weights[batch_idx, top_idx, None]
                output.index_add_(0, batch_idx, selected_x)
        return output, selected_experts

    def forward(self, x):
        x = self.conv1(x)
        x = F.relu(x)
        x = self.conv2(x)
        x = F.relu(x)
        x = F.max_pool2d(x, 2)
        x = self.dropout1(x)
        x = torch.flatten(x, 1)
        x = self.fc1(x)
        x = F.relu(x)
        x, selected_experts = self.moe(x)
        x = self.dropout2(x)
        x = self.fc2(x)
        output = F.log_softmax(x, dim=1)
        return output, selected_experts

def train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=None):
    model.train()
    ddp_loss = torch.zeros(2).to(rank)
    if sampler:
        sampler.set_epoch(epoch)
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(rank), target.to(rank)
        optimizer.zero_grad()
        output, selected_experts = model(data)
        if rank == 0:
            print(f"finish {batch_idx} forward, selected expert set = {set(selected_experts.view(-1).tolist())}")
        loss = F.nll_loss(output, target, reduction='sum')
        loss.backward()
        optimizer.step()
        ddp_loss[0] += loss.item()
        ddp_loss[1] += len(data)
    dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM)
    if rank == 0:
        print('Train Epoch: {} \tLoss: {:.6f}'.format(epoch, ddp_loss[0] / ddp_loss[1]))

def test(model, rank, world_size, test_loader):
    model.eval()
    correct = 0
    ddp_loss = torch.zeros(3).to(rank)
    with torch.no_grad():
        for data, target in test_loader:
            data, target = data.to(rank), target.to(rank)
            output, _ = model(data)  # the model returns (output, selected_experts)
            ddp_loss[0] += F.nll_loss(output, target, reduction='sum').item()  # sum up batch loss
            pred = output.argmax(dim=1, keepdim=True)  # get the index of the max log-probability
            ddp_loss[1] += pred.eq(target.view_as(pred)).sum().item()
            ddp_loss[2] += len(data)
    dist.all_reduce(ddp_loss, op=dist.ReduceOp.SUM)
    if rank == 0:
        test_loss = ddp_loss[0] / ddp_loss[2]
        print('Test set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
            test_loss, int(ddp_loss[1]), int(ddp_loss[2]),
            100. * ddp_loss[1] / ddp_loss[2]))

def fsdp_main(rank, world_size, args):
    setup(rank, world_size)
    transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize((0.1307,), (0.3081,))
    ])
    dataset1 = datasets.MNIST('./tmp', train=True, download=True,
                              transform=transform)
    dataset2 = datasets.MNIST('./tmp', train=False,
                              transform=transform)
    sampler1 = DistributedSampler(dataset1, rank=rank, num_replicas=world_size, shuffle=True)
    sampler2 = DistributedSampler(dataset2, rank=rank, num_replicas=world_size)
    train_kwargs = {'batch_size': args.batch_size, 'sampler': sampler1}
    test_kwargs = {'batch_size': args.test_batch_size, 'sampler': sampler2}
    cuda_kwargs = {'num_workers': 2,
                   'pin_memory': True,
                   'shuffle': False}
    train_kwargs.update(cuda_kwargs)
    test_kwargs.update(cuda_kwargs)
    train_loader = torch.utils.data.DataLoader(dataset1, **train_kwargs)
    test_loader = torch.utils.data.DataLoader(dataset2, **test_kwargs)
    my_auto_wrap_policy = functools.partial(
        size_based_auto_wrap_policy, min_num_params=100
    )  # defined but unused; wrap_policy below is what gets passed to FSDP
    wrap_policy = functools.partial(
        _module_wrap_policy,
        module_classes={Expert},
    )
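    # NOTE: wrap_policy above turns every Expert into its own FSDP unit, so
    # an expert's parameters are only all-gathered on a rank when that
    # expert's forward actually runs there.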
    torch.cuda.set_device(rank)
    init_start_event = torch.cuda.Event(enable_timing=True)
    init_end_event = torch.cuda.Event(enable_timing=True)
    model = Net().to(rank)
    model = FSDP(model, auto_wrap_policy=wrap_policy)
    if rank == 0:
        print(f"{model}")
    optimizer = optim.Adadelta(model.parameters(), lr=args.lr)
    scheduler = StepLR(optimizer, step_size=1, gamma=args.gamma)
    init_start_event.record()
    for epoch in range(1, args.epochs + 1):
        train(args, model, rank, world_size, train_loader, optimizer, epoch, sampler=sampler1)
        test(model, rank, world_size, test_loader)
        scheduler.step()
    init_end_event.record()
    if rank == 0:
        print(f"CUDA event elapsed time: {init_start_event.elapsed_time(init_end_event) / 1000}sec")
        print(f"{model}")
    if args.save_model:
        # use a barrier to make sure training is done on all ranks
        dist.barrier()
        states = model.state_dict()
        if rank == 0:
            torch.save(states, "mnist_cnn.pt")
    cleanup()

if __name__ == '__main__':
    # Training settings
    parser = argparse.ArgumentParser(description='PyTorch MNIST Example')
    parser.add_argument('--batch-size', type=int, default=64, metavar='N',
                        help='input batch size for training (default: 64)')
    parser.add_argument('--test-batch-size', type=int, default=1000, metavar='N',
                        help='input batch size for testing (default: 1000)')
    parser.add_argument('--epochs', type=int, default=10, metavar='N',
                        help='number of epochs to train (default: 10)')
    parser.add_argument('--lr', type=float, default=1.0, metavar='LR',
                        help='learning rate (default: 1.0)')
    parser.add_argument('--gamma', type=float, default=0.7, metavar='M',
                        help='Learning rate step gamma (default: 0.7)')
    parser.add_argument('--no-cuda', action='store_true', default=False,
                        help='disables CUDA training')
    parser.add_argument('--seed', type=int, default=1, metavar='S',
                        help='random seed (default: 1)')
    parser.add_argument('--save-model', action='store_true', default=False,
                        help='For Saving the current Model')
    args = parser.parse_args()

    torch.manual_seed(args.seed)
    WORLD_SIZE = torch.cuda.device_count()
    mp.spawn(fsdp_main,
             args=(WORLD_SIZE, args),
             nprocs=WORLD_SIZE,
             join=True)
```
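My current suspicion (unverified, so please correct me if this is wrong): with `_module_wrap_policy` each `Expert` is its own FSDP unit, and whether an expert's forward (and therefore its parameter all-gather) runs on a rank depends on the data-dependent top-k routing. If on some step an expert receives samples on one rank but none on another, the ranks issue mismatched collectives and NCCL deadlocks rather than erroring out. Running with `TORCH_DISTRIBUTED_DEBUG=DETAIL` and `NCCL_DEBUG=INFO` should help confirm which collective is stuck. Below is a minimal, hypothetical sketch of the workaround I have in mind (the name `moe_dense` is mine): a dense-dispatch drop-in for `Net.moe` that runs every expert on every step, with weight 0 for unselected samples, so all ranks always issue the same collectives. I have not yet verified that this avoids the hang.
```python
# Hypothetical workaround sketch, not verified: every expert runs
# unconditionally on every rank, so each rank triggers the same per-expert
# FSDP all-gathers and reduce-scatters, at the cost of extra FLOPs.
def moe_dense(self, x):
    batch_size, d_model = x.shape
    router_logits = self.gate(x)  # [batch_size, num_experts]
    probs = F.softmax(router_logits, dim=1, dtype=torch.float)
    topk_weights, selected_experts = torch.topk(probs, self.topk_experts, dim=-1)
    topk_weights /= topk_weights.sum(dim=-1, keepdim=True)
    # Scatter the renormalized top-k weights back into a dense
    # [batch_size, num_experts] matrix; non-selected experts get weight 0.
    dense_weights = torch.zeros_like(probs)
    dense_weights.scatter_(1, selected_experts, topk_weights)
    dense_weights = dense_weights.to(x.dtype)
    output = torch.zeros(batch_size, self.moe_out_dim, device=x.device, dtype=x.dtype)
    for idx in range(self.num_experts):
        # No data-dependent skip: the expert's forward always runs.
        h = self.projectors[idx](self.experts[idx](x))
        output += dense_weights[:, idx, None] * h
    return output, selected_experts
```
The other option I see is to drop the per-expert auto-wrap policy and wrap the whole `Net` as a single FSDP unit, which should also keep the collectives uniform across ranks, at the cost of coarser sharding.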
Here is the output; the job hangs after several steps with no error message:

### Versions
```
PyTorch version: 2.3.0+cu121
Is debug build: False
CUDA used to build PyTorch: 12.1
ROCM used to build PyTorch: N/A
OS: Ubuntu 20.04.6 LTS (x86_64)
GCC version: (Ubuntu 9.4.0-1ubuntu1~20.04.2) 9.4.0
Clang version: Could not collect
CMake version: version 3.16.3
Libc version: glibc-2.31
Python version: 3.10.14 (main, May 6 2024, 19:42:50) [GCC 11.2.0] (64-bit runtime)
Python platform: Linux-5.15.0-1050-azure-x86_64-with-glibc2.31
Is CUDA available: True
CUDA runtime version: 12.2.140
CUDA_MODULE_LOADING set to: LAZY
GPU models and configuration:
GPU 0: NVIDIA A100-SXM4-80GB
GPU 1: NVIDIA A100-SXM4-80GB
GPU 2: NVIDIA A100-SXM4-80GB
GPU 3: NVIDIA A100-SXM4-80GB
GPU 4: NVIDIA A100-SXM4-80GB
GPU 5: NVIDIA A100-SXM4-80GB
GPU 6: NVIDIA A100-SXM4-80GB
GPU 7: NVIDIA A100-SXM4-80GB
Nvidia driver version: 535.86.10
cuDNN version: Could not collect
HIP runtime version: N/A
MIOpen runtime version: N/A
Is XNNPACK available: True
CPU:
Architecture: x86_64
CPU op-mode(s): 32-bit, 64-bit
Byte Order: Little Endian
Address sizes: 48 bits physical, 48 bits virtual
CPU(s): 96
On-line CPU(s) list: 0-95
Thread(s) per core: 1
Core(s) per socket: 48
Socket(s): 2
NUMA node(s): 4
Vendor ID: AuthenticAMD
CPU family: 23
Model: 49
Model name: AMD EPYC 7V12 64-Core Processor
Stepping: 0
CPU MHz: 3276.212
BogoMIPS: 4890.88
Hypervisor vendor: Microsoft
Virtualization type: full
L1d cache: 3 MiB
L1i cache: 3 MiB
L2 cache: 48 MiB
L3 cache: 384 MiB
NUMA node0 CPU(s): 0-23
NUMA node1 CPU(s): 24-47
NUMA node2 CPU(s): 48-71
NUMA node3 CPU(s): 72-95
Vulnerability Gather data sampling: Not affected
Vulnerability Itlb multihit: Not affected
Vulnerability L1tf: Not affected
Vulnerability Mds: Not affected
Vulnerability Meltdown: Not affected
Vulnerability Mmio stale data: Not affected
Vulnerability Retbleed: Mitigation; untrained return thunk; SMT disabled
Vulnerability Spec rstack overflow: Mitigation; safe RET, no microcode
Vulnerability Spec store bypass: Mitigation; Speculative Store Bypass disabled via prctl and seccomp
Vulnerability Spectre v1: Mitigation; usercopy/swapgs barriers and __user pointer sanitization
Vulnerability Spectre v2: Mitigation; Retpolines, STIBP disabled, RSB filling, PBRSB-eIBRS Not affected
Vulnerability Srbds: Not affected
Vulnerability Tsx async abort: Not affected
Flags: fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ht syscall nx mmxext fxsr_opt pdpe1gb rdtscp lm constant_tsc rep_good nopl tsc_reliable nonstop_tsc cpuid extd_apicid aperfmperf pni pclmulqdq ssse3 fma cx16 sse4_1 sse4_2 movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm cmp_legacy cr8_legacy abm sse4a misalignsse 3dnowprefetch osvw topoext perfctr_core ssbd vmmcall fsgsbase bmi1 avx2 smep bmi2 rdseed adx smap clflushopt clwb sha_ni xsaveopt xsavec xgetbv1 clzero xsaveerptr rdpru arat umip rdpid
Versions of relevant libraries:
[pip3] mypy-extensions==1.0.0
[pip3] numpy==1.26.4
[pip3] torch==2.3.0
[pip3] torchaudio==2.3.0
[pip3] torchvision==0.18.0
[pip3] triton==2.3.0
[conda] numpy 1.26.4 pypi_0 pypi
[conda] torch 2.3.0 pypi_0 pypi
[conda] torchaudio 2.3.0 pypi_0 pypi
[conda] torchvision 0.18.0 pypi_0 pypi
[conda] triton 2.3.0 pypi_0 pypi
```