Hello. I have been trying to debug this issue for a while but have not gotten very far. I have achieved multi-gpu training (with DDP) through the following launch file:
import torch
import torch.multiprocessing as mp
import torch._dynamo as dynamo
import torch.distributed as dist
import os
import sys
import random

def setup_distribute(rank, world_size):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'
    dist.init_process_group("nccl", rank = rank, world_size = world_size)

def cleanup_distribute():
    dist.destroy_process_group()

def dist_wrapper(rank, world_size, func, name: str, SEED: int):
    # Per-process setup: init the process group, pin the device, then run func.
    setup_distribute(rank, world_size)
    torch.cuda.set_device(rank)
    set_dynamo_cfg()
    reset_deterministic(SEED)
    try:
        func(rank, world_size, name)
    finally:
        cleanup_distribute()

def set_dynamo_cfg():
    dynamo.config.capture_scalar_outputs = True
    dynamo.config.cache_size_limit = 256
    dynamo.config.guard_nn_modules = True
    dynamo.config.suppress_errors = True

def set_non_deterministic():
    torch.backends.cudnn.benchmark = True

def reset_deterministic(SEED):
    # Seed every RNG and force deterministic algorithms/kernels.
    import numpy as np
    random.seed(SEED)
    np.random.seed(SEED)
    torch.manual_seed(SEED)
    torch.use_deterministic_algorithms(True)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

def main(func, name: str, SEED: int):
    # Spawn one process per visible GPU.
    world_size = torch.cuda.device_count()
    mp.spawn(dist_wrapper, args=(world_size, func, name, SEED), nprocs=world_size, join=True)
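For reference, I launch it roughly like this (the train function and run name here are just placeholders, not my actual training code):

def train(rank: int, world_size: int, name: str):
    # Build the model, wrap it in DDP, and create DataLoaders backed by a
    # DistributedSampler (seeded, with set_epoch called every epoch).
    ...

if __name__ == "__main__":
    main(train, "baseline", SEED = 42)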
This all works fine (note that the DistributedSampler is also seeded properly). However, I start to run into issues with forward hooks. Specifically, when I attach the following hooks:
def _activation_hook(self, module, input, output: torch.Tensor) -> None:
    # Record the per-feature mean of the activation (detached, float64) for this batch.
    self.act_w.append(output.clone().detach().cpu().cuda().to(torch.float64).mean(dim = 0).view(-1))
    return

def _fake_activation_hook(self, module, input, output: torch.Tensor) -> None:
    # Same as above, but applies a ReLU first (F is torch.nn.functional).
    self.act_w.append(F.relu(output.clone().detach().cpu().cuda()).to(torch.float64).mean(dim = 0).view(-1))
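These are methods on a class of mine and get attached with register_forward_hook. A simplified sketch of the registration (the layer filter here is illustrative, not my exact selection):

# Simplified sketch of how the hooks get registered (illustrative layer filter).
self.act_w = []
self.handles = []
for module in self.model.modules():
    if isinstance(module, torch.nn.Linear):
        self.handles.append(module.register_forward_hook(self._activation_hook))

# ...and removed again afterwards:
for handle in self.handles:
    handle.remove()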
Through testing, I have noticed that the gradients are slightly different between runs with the hooks attached and runs without them.
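By "slightly different" I mean something along the lines of the following check (a simplified sketch, not my exact test code), run once with the hooks registered and once without:

# Hypothetical helper: flatten all parameter gradients into one float64 vector.
def grad_fingerprint(model: torch.nn.Module) -> torch.Tensor:
    return torch.cat([p.grad.detach().to(torch.float64).reshape(-1)
                      for p in model.parameters() if p.grad is not None])

# After loss.backward() on the identical (seeded) batch in the two runs,
# the max absolute difference between the two fingerprints is small but non-zero.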
Note that the device-transfer part of the hook is not working either. It was inspired by noticing that if I just transferred everything to the CPU, training would remain deterministic. However, it is still non-deterministic both without any device transfers and with the round trip to CPU and back to CUDA. The rest of the training (like initialization and the data) still remains deterministic.
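For concreteness, the CPU-only variant I am referring to is just the same hook without the transfer back to CUDA (sketch, hypothetical name):

# CPU-only variant (this one stays deterministic in my runs): no .cuda() after .cpu().
def _activation_hook_cpu(self, module, input, output: torch.Tensor) -> None:
    self.act_w.append(output.clone().detach().cpu().to(torch.float64).mean(dim = 0).view(-1))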
Furthermore, if I run it with and without the hook and call a simple torch.randint(0, 100000, (1,)) (on either device), the result still comes out the same, so the RNG state does not appear to be affected. Can anyone help me out?