I am trying to train ML-Agents on an AWS instance. I have installed the NVIDIA drivers and I am able to run training with a single environment. On my personal computer I can run multiple environments, but when I try to run multiple environments on the AWS instance I always get a different error, each time something to do with running out of memory or being unable to multiprocess. Here is the error for running with 2 envs, along with (roughly) the command I use to launch it; it starts training for about a second before crashing.
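This is roughly how I launch training; the config path, environment build path, and run-id are placeholders for my actual values:

mlagents-learn <path-to-trainer-config.yaml> --env=<path-to-env-build> --num-envs=2 --run-id=<run-id> --no-graphics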
Traceback (most recent call last):
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\multiprocessing\queues.py", line 239, in _feed
obj = _ForkingPickler.dumps(obj)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\multiprocessing\reduction.py", line 51, in dumps
cls(buf, protocol).dump(obj)
MemoryError
Traceback (most recent call last):
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents\trainers\trainer_controller.py", line 176, in start_learning
n_steps = self.advance(env_manager)
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents_envs\timers.py", line 305, in wrapped
return func(*args, **kwargs)
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents\trainers\trainer_controller.py", line 234, in advance
new_step_infos = env_manager.get_steps()
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents\trainers\env_manager.py", line 124, in get_steps
new_step_infos = self._step()
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents\trainers\subprocess_env_manager.py", line 417, in _step
step: EnvironmentResponse = self.step_queue.get_nowait()
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\multiprocessing\queues.py", line 129, in get_nowait
return self.get(False)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\multiprocessing\queues.py", line 111, in get
res = self._recv_bytes()
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\multiprocessing\connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\multiprocessing\connection.py", line 318, in _recv_bytes
return self._get_more_data(ov, maxsize)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\multiprocessing\connection.py", line 340, in _get_more_data
ov, err = _winapi.ReadFile(self._handle, left, overlapped=True)
MemoryError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents\trainers\learn.py", line 132, in run_training
tc.start_learning(env_manager)
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents_envs\timers.py", line 305, in wrapped
return func(*args, **kwargs)
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents\trainers\trainer_controller.py", line 201, in start_learning
self._save_models()
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents_envs\timers.py", line 305, in wrapped
return func(*args, **kwargs)
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents\trainers\trainer_controller.py", line 80, in _save_models
self.trainers[brain_name].save_model()
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents\trainers\ghost\trainer.py", line 333, in save_model
self.trainer.save_model()
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents\trainers\trainer\rl_trainer.py", line 185, in save_model
model_checkpoint = self._checkpoint()
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents_envs\timers.py", line 305, in wrapped
return func(*args, **kwargs)
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents\trainers\trainer\rl_trainer.py", line 157, in _checkpoint
export_path, auxillary_paths = self.model_saver.save_checkpoint(
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents\trainers\model_saver\torch_model_saver.py", line 60, in save_checkpoint
self.export(checkpoint_path, behavior_name)
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents\trainers\model_saver\torch_model_saver.py", line 65, in export
self.exporter.export_policy_model(output_filepath)
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents\trainers\torch\model_serialization.py", line 164, in export_policy_model
torch.onnx.export(
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\torch\onnx_init.py", line 271, in export
return utils.export(model, args, f, export_params, verbose, training,
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\torch\onnx\utils.py", line 88, in export
_export(model, args, f, export_params, verbose, training, input_names, output_names,
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\torch\onnx\utils.py", line 709, in _export
proto, export_map = graph._export_onnx(
MemoryError: bad allocation
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\runpy.py", line 193, in _run_module_as_main
return _run_code(code, main_globals, None,
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\runpy.py", line 86, in _run_code
exec(code, run_globals)
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\Scripts\mlagents-learn.exe_main.py", line 7, in
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents\trainers\learn.py", line 260, in main
run_cli(parse_command_line())
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents\trainers\learn.py", line 256, in run_cli
run_training(run_seed, options, num_areas)
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents\trainers\learn.py", line 134, in run_training
env_manager.close()
File "C:\Users\Administrator\Desktop\maxkcystuff\pyawsjulvenv\lib\site-packages\mlagents\trainers\subprocess_env_manager.py", line 489, in close
step: EnvironmentResponse = self.step_queue.get_nowait()
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\multiprocessing\queues.py", line 129, in get_nowait
return self.get(False)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\multiprocessing\queues.py", line 111, in get
res = self._recv_bytes()
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\multiprocessing\connection.py", line 216, in recv_bytes
buf = self._recv_bytes(maxlength)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\multiprocessing\connection.py", line 318, in _recv_bytes
return self._get_more_data(ov, maxsize)
File "C:\Users\Administrator\AppData\Local\Programs\Python\Python38\lib\multiprocessing\connection.py", line 340, in _get_more_data
ov, err = _winapi.ReadFile(self._handle, left, overlapped=True)
MemoryError