I’m currently running multi-GPU training on 4 Tesla T4 GPUs using PyTorch’s DistributedDataParallel (DDP) module.
The problem is that after some iterations I get the following traceback, which does not interrupt the application.
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Traceback (most recent call last):
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 300, in _run_finalizers
finalizer()
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/util.py", line 224, in __call__
res = self._callback(*self._args, **self._kwargs)
File "/opt/anaconda/anaconda3/lib/python3.8/multiprocessing/synchronize.py", line 87, in _cleanup
sem_unlink(name)
FileNotFoundError: [Errno 2] No such file or directory
Moreover, at the end I get the following additional warning:
UserWarning: resource_tracker: There appear to be 98 leaked semaphore objects to clean up at shutdown
Currently, my DDP training function is the following:
def train_ddp(rank: int, world_size: int, params: Dict[str, Any], epochs: int, conn: connection.Connection) -> None:
    """Train and test the model on one DDP rank and report averaged metrics.

    The function initialises the process group, wraps the model in DDP,
    builds one distributed ``DataLoader`` per split, runs training/evaluation
    and testing through ``TrainWorker``, gathers every rank's metric tensors
    on rank 0, averages them over ``world_size`` and sends the result through
    ``conn``. The process group is destroyed on the way out, even when an
    exception is raised.

    Args:
        rank: Index of this process; also used as the CUDA device index.
        world_size: Total number of participating processes.
        params: Shared configuration. Reads ``image_size``, ``num_classes``,
            ``n_channels``, ``batch_size``, ``train_ds``, ``val_ds`` and
            ``test_ds``; writes ``model``, ``train_dl``, ``val_dl`` and
            ``test_dl``.
        epochs: Number of epochs forwarded to ``TrainWorker.train_evaluate``.
        conn: Connection on which rank 0 sends the tuple
            ``(train_results, test_results)`` as nested Python lists.

    Raises:
        KeyError: If ``SLURM_CPUS_PER_TASK`` is not set in the environment.
        ValueError: If ``SLURM_CPUS_PER_TASK`` is not an integer.
    """
    initialize_preocess(rank, world_size)
    try:
        model = ResNet_Weird(
            BasicBlock, [2, 2, 2, 2],
            image_size=params['image_size'],
            num_classes=params['num_classes'],
            n_channels=params['n_channels'],
        ).to(rank)
        params['model'] = DDP(model, device_ids=[rank], output_device=rank, find_unused_parameters=True)

        num_workers = int(os.environ['SLURM_CPUS_PER_TASK'])
        # Only the training split is shuffled; all splits share the same seed.
        # NOTE(review): persistent workers stay alive for as long as these
        # loaders are referenced (here through ``params``). Whether that is
        # related to semaphore-leak warnings at shutdown is unconfirmed.
        for split, shuffle in (('train', True), ('val', False), ('test', False)):
            dataset = params[f'{split}_ds']
            params[f'{split}_dl'] = DataLoader(
                dataset, batch_size=params['batch_size'],
                sampler=DistributedSampler(
                    dataset, num_replicas=world_size, rank=rank, shuffle=shuffle, seed=100001,
                ),
                shuffle=False, pin_memory=True,
                # DataLoader rejects persistent_workers=True when num_workers == 0.
                persistent_workers=num_workers > 0,
                num_workers=num_workers,
            )

        train_test = TrainWorker(rank, params, world_size)
        is_root = rank == 0

        def gather_on_root(local: torch.Tensor) -> Optional[List[torch.Tensor]]:
            # Only the destination rank needs receive buffers; sizing them
            # from the local tensor guarantees the shapes gather() requires.
            buffers = [torch.zeros_like(local) for _ in range(world_size)] if is_root else None
            dist.gather(local, gather_list=buffers)
            return buffers

        # Every rank must take part in both collectives, in the same order.
        train_results = gather_on_root(train_test.train_evaluate(epochs))
        test_results = gather_on_root(train_test.test())

        if is_root:
            # Average the per-rank metrics element-wise before reporting.
            train_results = (torch.sum(torch.stack(train_results), dim=0) / world_size).cpu().tolist()
            test_results = (torch.sum(torch.stack(test_results), dim=0) / world_size).cpu().tolist()
            conn.send((train_results, test_results))
    finally:
        # Always tear the process group down, including on failure.
        destroy_process_group()
As you can see, I’ve already tried adding multiple barriers, but the problem was still present.