Hi,
The data loader in my AWS SageMaker training job runs fine when num_workers = 0, but produces the following error when num_workers > 0:
Traceback (most recent call last):
File "main.py", line 371, in <module>
g_scaler=g_scaler, d_scaler=d_scaler, runtime_log_folder=runtime_log_folder, runtime_log_file_name=runtime_log_file_name)
File "main.py", line 78, in train_fn
for idx, (x, y) in enumerate(loop):
File "/opt/conda/lib/python3.6/site-packages/tqdm/std.py", line 1171, in __iter__
for obj in iterable:
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 525, in __next__
(data, worker_id) = self._next_data()
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1252, in _next_data
return (self._process_data(data), w_id)
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1299, in _process_data
data.reraise()
File "/opt/conda/lib/python3.6/site-packages/torch/_utils.py", line 429, in reraise
raise self.exc_type(msg)
File "/opt/conda/lib/python3.6/site-packages/botocore/exceptions.py", line 84, in __init__
super(HTTPClientError, self).__init__(**kwargs)
File "/opt/conda/lib/python3.6/site-packages/botocore/exceptions.py", line 40, in __init__
msg = self.fmt.format(**kwargs)
KeyError: 'error'
---------------------------------------------------------------------------
UnexpectedStatusException Traceback (most recent call last)
<ipython-input-1-81655136a841> in <module>
58 py_version='py3')
59
---> 60 pytorch_estimator.fit({'train': Runtime.dataset_path}, job_name=Runtime.job_name)
61
62 #print(pytorch_estimator.latest_job_tensorboard_artifacts_path())
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/sagemaker/estimator.py in fit(self, inputs, wait, logs, job_name, experiment_config)
955 self.jobs.append(self.latest_training_job)
956 if wait:
--> 957 self.latest_training_job.wait(logs=logs)
958
959 def _compilation_job_name(self):
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/sagemaker/estimator.py in wait(self, logs)
1954 # If logs are requested, call logs_for_jobs.
1955 if logs != "None":
-> 1956 self.sagemaker_session.logs_for_job(self.job_name, wait=True, log_type=logs)
1957 else:
1958 self.sagemaker_session.wait_for_job(self.job_name)
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/sagemaker/session.py in logs_for_job(self, job_name, wait, poll, log_type)
3751
3752 if wait:
-> 3753 self._check_job_status(job_name, description, "TrainingJobStatus")
3754 if dot:
3755 print()
~/anaconda3/envs/pytorch_latest_p36/lib/python3.6/site-packages/sagemaker/session.py in _check_job_status(self, job, desc, status_key_name)
3304 ),
3305 allowed_statuses=["Completed", "Stopped"],
-> 3306 actual_status=status,
3307 )
3308
UnexpectedStatusException: Error for Training job 2022-06-03-05-16-49-pix2pix-U12239-2022-05-09-14-39-18-training: Failed. Reason: AlgorithmError: ExecuteUserScriptError:
Command "/opt/conda/bin/python3.6 main.py --runtime_var dataset_name=U12239-2022-05-09-14-39-18,job_name=2022-06-03-05-16-49-pix2pix-U12239-2022-05-09-14-39-18-training,model_name=pix2pix"
0%| | 0/248 [00:00<?, ?it/s]
0%| | 1/248 [00:30<2:07:28, 30.97s/it]
0%| | 1/248 [00:30<2:07:28, 30.97s/it]
Traceback (most recent call last):
File "main.py", line 371, in <module>
g_scaler=g_scaler, d_scaler=d_scaler, runtime_log_folder=runtime_log_folder, runtime_log_file_name=runtime_log_file_name)
File "main.py", line 78, in train_fn
for idx, (x, y) in enumerate(loop):
File "/opt/conda/lib/python3.6/site-packages/tqdm/std.py", line 1171, in __iter__
for obj in iterable:
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 525, in __next__
(data, worker_id) = self._next_data()
File "/opt/conda/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 1252, in _next_data
return (self
I am using a map-style dataset from AWS S3. The data loader produces an error when using multi-process data loading, but works fine with single-process data loading.
Does this mean that I should change to an iterable-style dataset in order to make multi-process data loading work? Or is there a way to make multi-process data loading work with a map-style dataset like mine?