My code trains fine, but it always exits with an error after training finishes:
...
[2021-04-24 13:45:52] -- DEBUG: val>>>[94/94-500] ips-9.1, loss-0.2466, liou-0.1080, l1-0.0061, miou-0.89
[2021-04-24 13:45:52] -- DEBUG: Training is done!
free(): invalid pointer
Traceback (most recent call last):
File "/home/space/Public/anaconda3/envs/pytorch17/lib/python3.8/runpy.py", line 194, in _run_module_as_main
return _run_code(code, main_globals, None,
File "/home/space/Public/anaconda3/envs/pytorch17/lib/python3.8/runpy.py", line 87, in _run_code
exec(code, run_globals)
File "/home/space/Public/anaconda3/envs/pytorch17/lib/python3.8/site-packages/torch/distributed/launch.py", line 260, in <module>
main()
File "/home/space/Public/anaconda3/envs/pytorch17/lib/python3.8/site-packages/torch/distributed/launch.py", line 255, in main
raise subprocess.CalledProcessError(returncode=process.returncode,
subprocess.CalledProcessError: Command '['/home/space/Public/anaconda3/envs/pytorch17/bin/python', '-u', 'train_filter_s.py', '--local_rank=1']' died with <Signals.SIGABRT: 6>.
Although it doesn't affect my results, I would still like to get rid of this error message.
Here is the relevant part of my code:
if 'RANK' in os.environ and 'WORLD_SIZE' in os.environ:
    args.world_size = int(os.environ['WORLD_SIZE'])
    args.rank = int(os.environ['RANK'])
    args.local_rank = int(os.environ['LOCAL_RANK'])
elif 'SLURM_PROCID' in os.environ:
    args.rank = int(os.environ['SLURM_PROCID'])
    args.local_rank = args.rank % torch.cuda.device_count()
args.master_addr = str(os.environ['MASTER_ADDR']) if 'MASTER_ADDR' in os.environ else '????'
args.master_port = str(os.environ['MASTER_PORT']) if 'MASTER_PORT' in os.environ else '????'
print('| distributed init (rank {} local {}) -- master://{}:{}'.format(
    args.rank, args.local_rank, args.master_addr, args.master_port), flush=True)
args.distributed = True
args.device = 'cuda'
torch.cuda.set_device(args.local_rank)
torch.distributed.init_process_group(backend='nccl', init_method='env://')  # env -- read from environ
torch.distributed.barrier()
setup_for_distributed(args.rank == 0)
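
# setup_for_distributed is defined elsewhere in my script; it just silences
# print() on non-master ranks, along the lines of the helper in DETR's
# util/misc.py. A minimal sketch of that pattern (illustrative, not verbatim):
import builtins

def setup_for_distributed(is_master):
    builtin_print = builtins.print

    def print(*args, **kwargs):
        # 'force=True' lets a non-master rank print when explicitly requested
        force = kwargs.pop('force', False)
        if is_master or force:
            builtin_print(*args, **kwargs)

    builtins.print = print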
...
...
for epoch in range(args.start_epoch, args.epochs):
    args.current_epoch = epoch
    # train
    if args.distributed:
        sampler_train.set_epoch(args.current_epoch)
    train_one_epoch(args, model, optimizer, loader_train, logger, writer_train)
    # change learning rate
    lr_scheduler.step()
    # save checkpoint
    if (args.current_epoch + 1) % args.save_interval == 0:
        save_checkpoint(args, model_without_ddp, optimizer, lr_scheduler)
    # validate
    if (args.current_epoch + 1) % args.val_interval == 0 and val_flag:
        if args.distributed:
            sampler_val.set_epoch(args.current_epoch)
        validate(args, model, loader_val, logger, writer_val)

# cleanup
if args.distributed:
    torch.distributed.destroy_process_group()
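
In case it helps anyone reproduce this, here is a stripped-down script that mirrors the init/teardown flow above. The tiny all_reduce is only a placeholder for my real training loop, and --nproc_per_node=2 is inferred from the --local_rank=1 in the traceback:

# minimal_repro.py
# launch: python -m torch.distributed.launch --nproc_per_node=2 minimal_repro.py
import argparse
import torch
import torch.distributed as dist

def main():
    parser = argparse.ArgumentParser()
    # torch.distributed.launch injects --local_rank into each worker process
    parser.add_argument('--local_rank', type=int, default=0)
    args = parser.parse_args()

    torch.cuda.set_device(args.local_rank)
    dist.init_process_group(backend='nccl', init_method='env://')
    dist.barrier()

    # placeholder "training": one all_reduce across ranks
    x = torch.ones(1, device='cuda')
    dist.all_reduce(x)
    print('rank {} done, x = {}'.format(dist.get_rank(), x.item()), flush=True)

    dist.barrier()  # make sure every rank is finished before teardown
    dist.destroy_process_group()

if __name__ == '__main__':
    main()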