I am training a model with DDP and found that my gradients were not being synced after the loss.backward() call. After debugging, I found the cause: I was manually setting requires_grad = False on parameters I did not want updated, after wrapping the model with DDP. This somehow prevents synchronization of the param.grad fields of the parameters I do care about. Is this the intended behavior, and should a warning be raised? Otherwise training proceeds with no errors.
Interestingly, when I turned on NCCL logging, I still saw allreduce operations being logged. I hypothesize that those are the allreduce operations for each bucket, which precede the final allreduce that writes to param.grad.
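For anyone reproducing, I enabled the logs with NCCL's debug environment variables (NCCL_DEBUG=INFO, optionally narrowed to collective calls with NCCL_DEBUG_SUBSYS=COLL):

NCCL_DEBUG=INFO NCCL_DEBUG_SUBSYS=COLL torchrun --standalone --nproc-per-node 2 script.py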
A small toy example to showcase the issue is provided below.
import os

import torch
import torch.nn as nn
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP


def ddp_setup():
    dist.init_process_group(backend="nccl")
    rank = int(os.environ["LOCAL_RANK"])
    torch.cuda.set_device(rank)


class SimpleModel(nn.Module):
    def __init__(self, n_layers):
        super().__init__()
        self.layers = nn.ModuleList([nn.Linear(2, 2) for _ in range(n_layers)])

    def forward(self, x):
        output = []
        for layer in self.layers:
            x = layer(x)
            output.append(x)
        return torch.stack(output, dim=0)


def main():
    ddp_setup()
    rank = dist.get_rank()
    torch.manual_seed(420)
    # Create a two-layer linear model and wrap it with DDP
    model = SimpleModel(2)
    model = DDP(model.to(rank), device_ids=[rank], output_device=rank)
    # Turn off gradients for the second layer because I only need the
    # output of the first (this is the step that breaks syncing)
    for name, param in model.named_parameters():
        if "1" in name:
            param.requires_grad = False
    # Pass dummy data through and compute the loss from the first layer only
    inp = torch.rand(2) + rank
    out = model(inp)
    loss = out[0].sum()
    loss.backward()
    # Log the (unsynced) gradients on each rank
    for name, param in model.named_parameters():
        if "bias" not in name:
            print(f"RANK: {rank}, Param: {name}, Grad: {param.grad}")
    dist.destroy_process_group()


if __name__ == "__main__":
    main()
which I run with
torchrun --standalone --nproc-per-node 2 script.py
with output
RANK: 1, Param: module.layers.0.weight, Grad: tensor([[1.2448, 1.8644], [1.2448, 1.8644]], device='cuda:1')
RANK: 1, Param: module.layers.1.weight, Grad: None
RANK: 0, Param: module.layers.0.weight, Grad: tensor([[0.2448, 0.8644], [0.2448, 0.8644]], device='cuda:0')
RANK: 0, Param: module.layers.1.weight, Grad: None
If I do not set requires_grad to False, the gradients are synced as expected.
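For what it's worth, I assume the intended pattern is to freeze parameters before constructing DDP, since (as far as I understand) the reducer only registers its gradient hooks for parameters that require grad at construction time. A minimal sketch of that reordering of my example above:

model = SimpleModel(2).to(rank)
# Freeze the second layer *before* wrapping, so DDP never registers
# hooks/buckets for these parameters (my assumption, based on the DDP
# docs' warning not to change parameters after wrapping the model)
for name, param in model.named_parameters():
    if "1" in name:
        param.requires_grad = False
model = DDP(model, device_ids=[rank], output_device=rank)

With this ordering the first layer's gradients should sync as usual, but I would still expect DDP to warn (or error) when requires_grad is flipped after construction.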