Hi, a couple of hours into training a transformer on a small dataset, I get a `RuntimeError` raised in a DataLoader worker process (full traceback below). I use PyTorch Lightning, so it might be related to that somehow. Is there anything I can do to avoid this?
Traceback (most recent call last):
File "/Users/stephan/Library/Mobile Documents/com~apple~CloudDocs/Ablage/AI Master/Courses/2021W/Practical Work in AI/retrosynthesis/src/train.py", line 162, in <module>
main()
File "/Users/stephan/Library/Mobile Documents/com~apple~CloudDocs/Ablage/AI Master/Courses/2021W/Practical Work in AI/retrosynthesis/src/train.py", line 146, in main
trainer.fit(model=model, datamodule=uspto_data)
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 740, in fit
self._call_and_handle_interrupt(
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 685, in _call_and_handle_interrupt
return trainer_fn(*args, **kwargs)
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 777, in _fit_impl
self._run(model, ckpt_path=ckpt_path)
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1199, in _run
self._dispatch()
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1279, in _dispatch
self.training_type_plugin.start_training(self)
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/pytorch_lightning/plugins/training_type/training_type_plugin.py", line 202, in start_training
self._results = trainer.run_stage()
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1289, in run_stage
return self._run_train()
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/pytorch_lightning/trainer/trainer.py", line 1319, in _run_train
self.fit_loop.run()
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/pytorch_lightning/loops/fit_loop.py", line 234, in advance
self.epoch_loop.run(data_fetcher)
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/pytorch_lightning/loops/base.py", line 145, in run
self.advance(*args, **kwargs)
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/pytorch_lightning/loops/epoch/training_epoch_loop.py", line 156, in advance
batch_idx, (batch, self.batch_progress.is_last_batch) = next(self._dataloader_iter)
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/pytorch_lightning/utilities/fetching.py", line 203, in __next__
return self.fetching_function()
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/pytorch_lightning/utilities/fetching.py", line 270, in fetching_function
self._fetch_next_batch()
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/pytorch_lightning/utilities/fetching.py", line 300, in _fetch_next_batch
batch = next(self.dataloader_iter)
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/pytorch_lightning/trainer/supporters.py", line 550, in __next__
return self.request_next_batch(self.loader_iters)
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/pytorch_lightning/trainer/supporters.py", line 562, in request_next_batch
return apply_to_collection(loader_iters, Iterator, next)
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/pytorch_lightning/utilities/apply_func.py", line 96, in apply_to_collection
return function(data, *args, **kwargs)
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 521, in __next__
data = self._next_data()
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1203, in _next_data
return self._process_data(data)
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1229, in _process_data
data.reraise()
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/torch/_utils.py", line 434, in reraise
raise exception
RuntimeError: Caught RuntimeError in DataLoader worker process 1.
Original Traceback (most recent call last):
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/torch/utils/data/_utils/worker.py", line 287, in _worker_loop
data = fetcher.fetch(index)
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/torch/utils/data/_utils/fetch.py", line 52, in fetch
return self.collate_fn(data)
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/pytorch_lightning/utilities/auto_restart.py", line 474, in _capture_metadata_collate
data = default_collate(samples)
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 84, in default_collate
return [default_collate(samples) for samples in transposed]
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 84, in <listcomp>
return [default_collate(samples) for samples in transposed]
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 64, in default_collate
return default_collate([torch.as_tensor(b) for b in batch])
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/torch/utils/data/_utils/collate.py", line 54, in default_collate
storage = elem.storage()._new_shared(numel)
File "/Users/stephan/opt/anaconda3/envs/prinai/lib/python3.8/site-packages/torch/storage.py", line 155, in _new_shared
return cls._new_using_filename(size)
RuntimeError: falseINTERNAL ASSERT FAILED at "/Users/distiller/project/conda/conda-bld/pytorch_1640811925055/work/aten/src/ATen/MapAllocator.cpp":263, please report a bug to PyTorch. unable to open shared memory object </torch_63688_0> in read-write mode
Process finished with exit code 1