Here is the error message and a screenshot captured just before the OOM occurs:
Traceback (most recent call last):
  File "run_train.py", line 557, in <module>
    main()
  File "run_train.py", line 499, in main
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
  File "/home/wayne/dev/pop-repos/model-train/wve/lib/python3.8/site-packages/transformers/trainer.py", line 1645, in train
    return inner_training_loop(
  File "/home/wayne/dev/pop-repos/model-train/wve/lib/python3.8/site-packages/transformers/trainer.py", line 1929, in _inner_training_loop
    tr_loss_step = self.training_step(model, inputs)
  File "/home/wayne/dev/pop-repos/model-train/wve/lib/python3.8/site-packages/transformers/trainer.py", line 2751, in training_step
    loss = self.compute_loss(model, inputs)
  File "/home/wayne/dev/pop-repos/model-train/wve/lib/python3.8/site-packages/transformers/trainer.py", line 2780, in compute_loss
    outputs = model(**inputs)
  File "/home/wayne/dev/pop-repos/model-train/wve/lib/python3.8/site-packages/torch/nn/modules/module.py", line 1501, in _call_impl
    return forward_call(*args, **kwargs)
  File "/home/wayne/dev/pop-repos/model-train/wve/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 172, in forward
    return self.gather(outputs, self.output_device)
  File "/home/wayne/dev/pop-repos/model-train/wve/lib/python3.8/site-packages/torch/nn/parallel/data_parallel.py", line 184, in gather
    return gather(outputs, output_device, dim=self.dim)
  File "/home/wayne/dev/pop-repos/model-train/wve/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 86, in gather
    res = gather_map(outputs)
  File "/home/wayne/dev/pop-repos/model-train/wve/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 77, in gather_map
    return type(out)((k, gather_map([d[k] for d in outputs]))
  File "<string>", line 12, in __init__
  File "/home/wayne/dev/pop-repos/model-train/wve/lib/python3.8/site-packages/transformers/utils/generic.py", line 277, in __post_init__
    for idx, element in enumerate(iterator):
  File "/home/wayne/dev/pop-repos/model-train/wve/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 77, in <genexpr>
    return type(out)((k, gather_map([d[k] for d in outputs]))
  File "/home/wayne/dev/pop-repos/model-train/wve/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 81, in gather_map
    return type(out)(map(gather_map, zip(*outputs)))
  File "/home/wayne/dev/pop-repos/model-train/wve/lib/python3.8/site-packages/torch/nn/parallel/scatter_gather.py", line 71, in gather_map
    return Gather.apply(target_device, dim, *outputs)
  File "/home/wayne/dev/pop-repos/model-train/wve/lib/python3.8/site-packages/torch/autograd/function.py", line 506, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
  File "/home/wayne/dev/pop-repos/model-train/wve/lib/python3.8/site-packages/torch/nn/parallel/_functions.py", line 75, in forward
    return comm.gather(inputs, ctx.dim, ctx.target_device)
  File "/home/wayne/dev/pop-repos/model-train/wve/lib/python3.8/site-packages/torch/nn/parallel/comm.py", line 235, in gather
    return torch._C._gather(tensors, dim, destination)
torch.cuda.OutOfMemoryError: CUDA out of memory. Tried to allocate 92.00 MiB (GPU 0; 23.69 GiB total capacity; 20.63 GiB already allocated; 92.31 MiB free; 22.22 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation. See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF
0%|
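For reference, the last line of the error message suggests setting max_split_size_mb through PYTORCH_CUDA_ALLOC_CONF. A minimal sketch of how that hint could be applied, assuming it runs before torch initializes CUDA; the 128 MiB split size is only an illustrative value, not something taken from this run:

import os

# Assumption: executed at the very top of run_train.py, before `import torch`,
# so the caching allocator reads the setting when CUDA is initialized.
# max_split_size_mb:128 is an example value to tune, not a recommendation from this log.
os.environ.setdefault("PYTORCH_CUDA_ALLOC_CONF", "max_split_size_mb:128")

import torch  # imported after the env var so the allocator configuration takes effect

Equivalently, the variable can be exported in the shell before launching the training script.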