I am using the example from the "Initialize DDP with torch.distributed.run/torchrun" section of the PyTorch DDP tutorial. I added a dict, state_info, as an additional input to the forward function so that it tracks the state of each forward call. However, the dict is not updated under DDP. The code is:
import torch
import torch.distributed as dist
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP


class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.net1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5)

    def forward(self, x, state_info):
        state_info['tmp'] = 10  # additional input that tracks the state
        return self.net2(self.relu(self.net1(x)))


def demo_basic():
    dist.init_process_group("nccl")
    rank = dist.get_rank()
    print(f"Start running basic DDP example on rank {rank}.")

    # create model and move it to GPU with id rank
    device_id = rank % torch.cuda.device_count()
    model = ToyModel().to(device_id)
    ddp_model = DDP(model, device_ids=[device_id])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)

    optimizer.zero_grad()
    state_info = {'tmp': 0}
    print(state_info)  # {'tmp': 0}
    outputs = ddp_model(torch.randn(20, 10), state_info)
    labels = torch.randn(20, 5).to(device_id)
    loss_fn(outputs, labels).backward()
    optimizer.step()
    print(state_info)  # the dict is not updated here; still {'tmp': 0}

    dist.destroy_process_group()


if __name__ == "__main__":
    demo_basic()
At the second print call, state_info is unchanged: 'tmp' is still 0 rather than 10. When DDP is not used, state_info is updated as expected. Is this a feature of DDP? Is there any way to solve it?
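To show what I mean by the non-DDP case, here is a minimal sketch where the same forward call updates the dict (I move the input to the device manually here, since a plain module does not move its inputs the way DDP does):

model = ToyModel().to(device_id)
state_info = {'tmp': 0}
outputs = model(torch.randn(20, 10).to(device_id), state_info)
print(state_info)  # {'tmp': 10} -- the in-place update is visible without DDP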
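One workaround I can think of, assuming the dict is being copied somewhere when DDP moves the inputs to the device, is to return the state from forward instead of relying on in-place mutation of the argument. A sketch of the change:

def forward(self, x, state_info):
    state_info['tmp'] = 10  # this presumably mutates DDP's copy, so return it explicitly
    return self.net2(self.relu(self.net1(x))), state_info

# caller side:
outputs, state_info = ddp_model(torch.randn(20, 10), state_info)
print(state_info)  # now reflects the value set inside forward

But I would prefer to keep mutating the dict in place, if DDP supports that.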