It doesn’t seem to be related to DDP or PyTorch, but to how the logging module is set up. If you removed all the torch code, you would still get the same result.
Hi @agolynski, thank you so much for your kind reply. I have adjusted my code and found that the logger works very well if it is created inside the DDP worker process, but fails again if it is passed in as an argument. The following snippet demonstrates this:
import os
import logging

import torch
import torch.distributed as dist
import torch.multiprocessing as mp
import torch.nn as nn
import torch.optim as optim
from torch.nn.parallel import DistributedDataParallel as DDP


def setup(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    # Initialize the process group.
    dist.init_process_group('nccl', rank=rank, world_size=world_size)


def cleanup():
    dist.destroy_process_group()


class ToyModel(nn.Module):
    def __init__(self):
        super(ToyModel, self).__init__()
        self.net1 = nn.Linear(10, 10)
        self.relu = nn.ReLU()
        self.net2 = nn.Linear(10, 5)

    def forward(self, x):
        return self.net2(self.relu(self.net1(x)))


def demo_basic(rank, world_size, logger=None):
    setup(rank, world_size)
    if rank == 0:
        logger = get_logger() if logger is None else logger
        logger.info('info in process')
        logger.error('error in process.')

    # Create model and move it to GPU.
    model = ToyModel().to(rank)
    ddp_model = DDP(model, device_ids=[rank])

    loss_fn = nn.MSELoss()
    optimizer = optim.SGD(ddp_model.parameters(), lr=0.001)  # Optimizer takes the DDP model.

    optimizer.zero_grad()
    inputs = torch.randn(20, 10)  # .to(rank)
    outputs = ddp_model(inputs)
    labels = torch.randn(20, 5).to(rank)
    loss_fn(outputs, labels).backward()
    optimizer.step()

    cleanup()


def run_demo(demo_func, world_size):
    logger = get_logger()
    # logger = None  # Created from inside.
    mp.spawn(
        demo_func,
        args=(world_size, logger),
        nprocs=world_size,
        join=True
    )


def get_logger():
    logger = logging.getLogger('train')
    # Handlers.
    logger.addHandler(
        logging.StreamHandler()
    )
    logger.setLevel(logging.DEBUG)
    return logger


def example2():
    run_demo(demo_basic, 4)


def main():
    example2()


if __name__ == "__main__":
    main()
If logger = None in run_demo is left commented out (as above), "info in process" never appears in the output. However, if we comment out logger = get_logger() and uncomment logger = None instead, "info in process" does appear.
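Indeed, as you said, the same behavior reproduces with no torch code at all. Here is a minimal sketch using only the standard library and the spawn start method (the same start method mp.spawn uses); the worker function and its prints are my own additions for illustration:

import logging
import multiprocessing as mp


def worker(logger):
    # In the spawned child the pickled logger is re-fetched by name from a
    # fresh logging registry: no handlers, effective level WARNING.
    print('handlers:', logger.handlers,
          'effective level:', logger.getEffectiveLevel())
    logger.info('info in process')     # Dropped: INFO < WARNING.
    logger.error('error in process.')  # Printed by logging.lastResort.


if __name__ == '__main__':
    logger = logging.getLogger('train')
    logger.addHandler(logging.StreamHandler())
    logger.setLevel(logging.DEBUG)
    ctx = mp.get_context('spawn')
    p = ctx.Process(target=worker, args=(logger,))
    p.start()
    p.join()

When I reason through this, the child should print an empty handler list, an effective level of 30 (WARNING), and only "error in process." — exactly the behavior in the DDP script above.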
It does not block me, but I am quite curious why this happens; I thought DDP was essentially a wrapper around processes.
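My best guess after digging into the logging internals: logging.Logger.__reduce__ pickles a logger as nothing more than getLogger(name). Since mp.spawn pickles every argument for the child processes, what arrives in demo_basic is a freshly created 'train' logger in the child's own empty registry: no handlers, level unset, and an effective level of WARNING inherited from the root logger. That is why logger.info() is filtered out while logger.error() still shows up; records at WARNING and above that reach no handler fall through to the logging.lastResort handler, which writes to stderr. So DDP really is just a per-process wrapper, and the surprise lives entirely in logging. The workaround that matches my observation above is to configure the logger inside each spawned worker, e.g. (a sketch; the per-rank logger name and format string are my own choices):

import logging


def get_worker_logger(rank):
    # Build and configure the logger inside the spawned process.
    logger = logging.getLogger(f'train.rank{rank}')
    if not logger.handlers:  # Guard against adding duplicate handlers.
        handler = logging.StreamHandler()
        handler.setFormatter(
            logging.Formatter(f'[rank {rank}] %(levelname)s: %(message)s')
        )
        logger.addHandler(handler)
    logger.setLevel(logging.DEBUG)
    return logger

Calling get_worker_logger(rank) at the top of demo_basic, instead of passing a logger through mp.spawn, gives every rank a fully configured logger.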