Hello! The error I asked 2 days ago was solved. But I’ve encountered a new error which is similar to the error in this page and I don’t have an idea about how to solve it …Here is the code:
def pre_train_and_fine_tune(rank: int, world_size: int, hyperparams: dict, times: mp.Queue):
ddp_setup(rank, world_size)
training_mode = "pre_train"
configs = modify_the_configuration(training_mode, hyperparams)
train_or_fine_tune_or_test(rank, training_mode, configs)
training_mode = "fine_tune"
configs = modify_the_configuration(training_mode, hyperparams)
ts = train_or_fine_tune_or_test(rank, training_mode, configs)
times.put(ts) # This line of code is where the error is reported
destroy_process_group()
if __name__ == '__main__':
# adjust_parameters_in_a_given_range()
best_acc = float(77.0)
while True:
# MODIFIED
remove_previous_model_parameters()
hyperparams = dict(
lr = rand_float(0.001, 0.003),
batch_size = random.randrange(256, 512+1, step=64),
target_batch_size = random.randrange(32, 64, step=8),
temperature = rand_float(0.01, 0.02),
weight_decay = rand_float(0.00020, 0.0003),
num_epoch = random.randrange(60, 70, 2),
lam = rand_float(0.740, 0.764),
kernel_size = random.choice([3, 4, 5, 6, 7, 8]),
)
world_size = torch.cuda.device_count()
times = mp.Queue(maxsize=4)
mp.spawn(pre_train_and_fine_tune, args=(world_size, hyperparams, times), nprocs=world_size)
training_mode = "test"
And here is my traceback:
Traceback (most recent call last):
File “/home/…/.conda/envs/lrw/lib/python3.10/runpy.py”, line 196, in _run_module_as_main
return _run_code(code, main_globals, None,
File “/home/…/.conda/envs/lrw/lib/python3.10/runpy.py”, line 86, in _run_code
exec(code, run_globals)
File “/home/…/bundled/libs/debugpy/adapter/…/…/debugpy/launcher/…/…/debugpy/main.py”, line 39, in
cli.main()
File “/home/…/bundled/libs/debugpy/adapter/…/…/debugpy/launcher/…/…/debugpy/…/debugpy/server/cli.py”, line 430, in main
run()
File “/home/…/bundled/libs/debugpy/adapter/…/…/debugpy/launcher/…/…/debugpy/…/debugpy/server/cli.py”, line 284, in run_file
runpy.run_path(target, run_name=“main”)
File “/home/…/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py”, line 321, in run_path
return _run_module_code(code, init_globals, run_name,
File “/home/…/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py”, line 135, in _run_module_code
_run_code(code, mod_globals, init_globals,
File “/home/…/bundled/libs/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py”, line 124, in _run_code
exec(code, run_globals)
File “/home/…/TFC-pretraining-main/code/TFC/main_modified.py”, line 573, in
mp.spawn(pre_train_and_fine_tune, args=(world_size, hyperparams, times), nprocs=world_size)
File “/home/…/.conda/envs/lrw/lib/python3.10/site-packages/torch/multiprocessing/spawn.py”, line 246, in spawn
return start_processes(fn, args, nprocs, join, daemon, start_method=“spawn”)
File “/home/…/.conda/envs/lrw/lib/python3.10/site-packages/torch/multiprocessing/spawn.py”, line 202, in start_processes
while not context.join():
File “/home/…/.conda/envs/lrw/lib/python3.10/site-packages/torch/multiprocessing/spawn.py”, line 145, in join
raise ProcessExitedException(
torch.multiprocessing.spawn.ProcessExitedException: process 0 terminated with signal SIGSEGV
Your help will be appreciated. Thanks a lot!