Too many open files caused by persistent_workers and pin_memory

I’m having some trouble running PyTorch Lightning together with Optuna. After a few trials I run into the following error:

OSError: [Errno 24] Too many open files

I’ve assembled a script based on Optuna’s PyTorch Lightning example that very consistently reproduces this error on my machine.

"""
Optuna example that optimizes multi-layer perceptrons using PyTorch Lightning.

In this example, we optimize the validation accuracy of fashion product recognition using
PyTorch Lightning, and FashionMNIST. We optimize the neural network architecture. As it is too time
consuming to use the whole FashionMNIST dataset, we here use a small subset of it.

You can run this example as follows, pruning can be turned on and off with the `--pruning`
argument.
	$ python pytorch_lightning_simple.py [--pruning]

"""

from torch.utils.data import random_split
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision import datasets
from torch import optim
from torch import nn

import torch.nn.functional as F
import pytorch_lightning as pl
import optuna
import torch


from typing import List, Optional
from packaging import version

import errno
import gc
import os


PERCENT_VALID_EXAMPLES = 0.1
NUM_WORKERS = 16
BATCHSIZE = 128
CLASSES = 10
EPOCHS = 1
DIR = os.getcwd()

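# Toggling either of these two flags to False makes the "too many open files" crash go away (see notes below the script).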
PERSISTENT_WORKERS = True
PIN_MEMORY = True

def print_debug():
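	"""Print every file descriptor this process currently has open and the path it points to."""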
	ret = {}
	base = '/proc/self/fd'
	for num in os.listdir(base):
		path = None
		try:
			path = os.readlink(os.path.join(base, num))
		except OSError as err:
			# Last FD is always the "listdir" one (which may be closed)
			if err.errno != errno.ENOENT:
				raise
		ret[int(num)] = path
	for k, v in ret.items():
		print(k, v)

def print_simple_debug(id):
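	"""Print a banner with the current number of open file descriptors; id just tags the call site."""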
	ret = {}
	base = '/proc/self/fd'
	for num in os.listdir(base):
		path = None
		try:
			path = os.readlink(os.path.join(base, num))
		except OSError as err:
			# Last FD is always the "listdir" one (which may be closed)
			if err.errno != errno.ENOENT:
				raise
		ret[int(num)] = path

	print(11*'#')
	print(f"## ID:{id:02d} ##")
	print(f"## {len(ret):05d} ##")
	print(11*'#')


class Net(nn.Module):
	def __init__(self, dropout: float, output_dims: List[int]) -> None:
		super().__init__()
		layers: List[nn.Module] = []

		input_dim: int = 28 * 28
		for output_dim in output_dims:
			layers.append(nn.Linear(input_dim, output_dim))
			layers.append(nn.ReLU())
			layers.append(nn.Dropout(dropout))
			input_dim = output_dim

		layers.append(nn.Linear(input_dim, CLASSES))

		self.layers = nn.Sequential(*layers)

	def forward(self, data: torch.Tensor) -> torch.Tensor:
		logits = self.layers(data)
		return F.log_softmax(logits, dim=1)


class LightningNet(pl.LightningModule):
	def __init__(self, dropout: float, output_dims: List[int]) -> None:
		super().__init__()
		self.model = Net(dropout, output_dims)

	def forward(self, data: torch.Tensor) -> torch.Tensor:
		return self.model(data.view(-1, 28 * 28))

	def training_step(self, batch: List[torch.Tensor], batch_idx: int) -> torch.Tensor:
		data, target = batch
		output = self(data)
		return F.nll_loss(output, target)

	def validation_step(self, batch: List[torch.Tensor], batch_idx: int) -> None:
		data, target = batch
		output = self(data)
		pred = output.argmax(dim=1, keepdim=True)
		accuracy = pred.eq(target.view_as(pred)).float().mean()
		self.log("val_acc", accuracy)
		self.log("hp_metric", accuracy, on_step=False, on_epoch=True)

	def configure_optimizers(self) -> optim.Optimizer:
		return optim.Adam(self.model.parameters())


class FashionMNISTDataModule(pl.LightningDataModule):
	def __init__(self, data_dir: str, batch_size: int, num_workers: int, pin_memory: bool, persistent_workers: bool):
		super().__init__()
		self.persistent_workers = persistent_workers
		self.num_workers = num_workers
		self.pin_memory = pin_memory
		self.batch_size = batch_size
		self.data_dir = data_dir

	def setup(self, stage: Optional[str] = None) -> None:
		self.mnist_test = datasets.FashionMNIST(
			self.data_dir, train=False, download=True, transform=transforms.ToTensor()
		)
		mnist_full = datasets.FashionMNIST(
			self.data_dir, train=True, download=True, transform=transforms.ToTensor()
		)
		self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])

	def train_dataloader(self) -> DataLoader:
		return DataLoader(
			self.mnist_train,
			batch_size=self.batch_size,
			num_workers=self.num_workers,
			persistent_workers=self.persistent_workers,
			pin_memory=self.pin_memory,
			shuffle=False
		)

	def val_dataloader(self) -> DataLoader:
		return DataLoader(
			self.mnist_val,
			batch_size=self.batch_size,
			num_workers=self.num_workers,
			persistent_workers=self.persistent_workers,
			pin_memory=self.pin_memory,
			shuffle=False
		)

	def test_dataloader(self) -> DataLoader:
		return DataLoader(
			self.mnist_test,
			batch_size=self.batch_size,
			num_workers=self.num_workers,
			persistent_workers=self.persistent_workers,
			pin_memory=self.pin_memory,
			shuffle=False
		)


def objective(trial: optuna.trial.Trial) -> float:
	# We optimize the number of layers, hidden units in each layer and dropouts.
	n_layers = trial.suggest_int("n_layers", 1, 3)
	dropout = trial.suggest_float("dropout", 0.2, 0.5)
	output_dims = [
		trial.suggest_int("n_units_l{}".format(i), 4, 128, log=True) for i in range(n_layers)
	]
	hyperparameters = dict(n_layers=n_layers, dropout=dropout, output_dims=output_dims)

	trainer = pl.Trainer(
		limit_val_batches=PERCENT_VALID_EXAMPLES,
		enable_checkpointing=False,
		accelerator="auto",
		max_epochs=EPOCHS,
		logger=True,
		devices=1
	)
	datamodule = FashionMNISTDataModule(
		persistent_workers=PERSISTENT_WORKERS,
		num_workers=NUM_WORKERS,
		pin_memory=PIN_MEMORY,
		batch_size=BATCHSIZE,
		data_dir=DIR
	)
	model = LightningNet(
		dropout,
		output_dims
	)

	trainer.logger.log_hyperparams(hyperparameters)
	
	print_simple_debug(0)
	trainer.fit(model, datamodule=datamodule)
	print_simple_debug(1)

	gc.collect()

	return trainer.callback_metrics["val_acc"].item()


if __name__ == "__main__":
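	# Share tensors via the filesystem instead of passing file descriptors between
	# processes (the usual first mitigation suggested for fd exhaustion in DataLoader workers).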
	torch.multiprocessing.set_sharing_strategy('file_system')
	torch.set_float32_matmul_precision('medium')

	study = optuna.create_study(direction="maximize")
	study.optimize(objective, n_trials=100)

	print("Number of finished trials: {}".format(len(study.trials)))

	print("Best trial:")
	trial = study.best_trial

	print("  Value: {}".format(trial.value))

	print("  Params: ")
	for key, value in trial.params.items():
		print("    {}: {}".format(key, value))


When both PERSISTENT_WORKERS and PIN_MEMORY are set to True I get the error; if either (or both) is set to False, the code runs without issue. pin_memory didn’t have a big impact on performance for me, so for now I’m running my code without it.
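
If anyone needs a stop-gap while debugging, one option (a sketch only, not a fix; the resource module is in the standard library and this is Unix-only) is to raise the process’s open-file soft limit at the top of the script so the leaked descriptors take longer to hit the ceiling:

import resource

# Stop-gap only: bump the soft limit on open file descriptors up to the hard limit.
soft, hard = resource.getrlimit(resource.RLIMIT_NOFILE)
if hard != resource.RLIM_INFINITY:
	resource.setrlimit(resource.RLIMIT_NOFILE, (hard, hard))
	print(f"RLIMIT_NOFILE soft limit raised from {soft} to {hard}")

This only delays the crash, of course; the actual workaround for me is running with PIN_MEMORY = False as described above.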

The print_simple_debug() function prints how many file descriptors the Python process has open, a number that keeps increasing until the eventual crash. print_debug() shows what those descriptors point to, but the ones leaked by this bug all show up as anonymous pipes (‘pipe:[NUMBER]’), so it hasn’t helped much.
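
For anyone who wants a more readable view than /proc/self/fd, here is a rough equivalent using the third-party psutil package (not used in the script above, so treat it as a sketch):

import psutil

# Show how many descriptors the main process and each of its children
# (e.g. DataLoader worker processes) currently hold open.
proc = psutil.Process()
print("main process fds:", proc.num_fds())
for child in proc.children(recursive=True):
	print(f"child {child.pid} ({child.name()}): {child.num_fds()} fds")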

I suspect it is related to the multiprocessing DataLoader iterator, since a higher number of workers leads to a quicker crash and the persistent_workers flag has to be on for it to happen, but I haven’t been able to pin down the precise cause.
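
To narrow it down further, a minimal loop like the one below (a sketch, with no Lightning or Optuna involved) should show whether the leak sits in the DataLoader worker/pin-memory machinery itself: if the printed fd count keeps climbing from one iteration to the next, Optuna and Lightning are off the hook.

import os
import torch
from torch.utils.data import DataLoader, TensorDataset

# Repeatedly create and drop a DataLoader with the same flags as above and watch
# how many descriptors this process has open (Linux: /proc/self/fd).
dataset = TensorDataset(torch.randn(1024, 28 * 28), torch.randint(0, 10, (1024,)))
for i in range(20):
	loader = DataLoader(
		dataset, batch_size=128, num_workers=4,
		persistent_workers=True, pin_memory=True,
	)
	next(iter(loader))  # start the worker processes (and the pin-memory thread)
	del loader          # drop the only reference; the workers should be shut down here
	print(i, "open fds:", len(os.listdir("/proc/self/fd")))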

Running this on Ubuntu 22 with PyTorch 2.0.1, PyTorch Lightning 2.1.2, and Optuna 3.4.0.

I have the exact same problem, but I also noticed that it crashes even when I set persistent_workers=False and use multiple workers.
I’m now trying it with only one worker, but even then my RAM slowly fills up.
My DataLoader loads images from disk, processes them, and feeds them to my model. After each fit run I call gc.collect() and even delete my datamodule, but nothing seems to release the RAM or kill the DataLoader worker processes.

I can reproduce a similar error, though my conditions are a little different.

Regardless of how persistent_workers and pin_memory are set, the pipeline crashes if I add a pruning callback (PyTorchLightningPruningCallback) for early stopping. It looks like when a poorly performing run gets pruned, its file descriptors aren’t properly released, so they build up over time. I don’t actually get a crash with pin_memory or persistent_workers set to True on their own. Perhaps upgrading PyTorch, PyTorch Lightning, and Optuna will help you.
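
For reference, this is roughly how the callback is wired up on my side (a sketch with a hypothetical build_trainer helper; the constants match the script above, and depending on your Optuna version the callback may live in the separate optuna-integration package):

import pytorch_lightning as pl
from optuna.integration import PyTorchLightningPruningCallback

def build_trainer(trial):
	# Same Trainer settings as the script above, plus the pruning callback that
	# seems to trigger the descriptor build-up whenever a trial gets pruned.
	return pl.Trainer(
		limit_val_batches=PERCENT_VALID_EXAMPLES,
		enable_checkpointing=False,
		accelerator="auto",
		max_epochs=EPOCHS,
		devices=1,
		callbacks=[PyTorchLightningPruningCallback(trial, monitor="val_acc")],
	)

Pruning itself uses the study’s pruner, which is a MedianPruner by default in optuna.create_study.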

Ubuntu 22.04.2, PyTorch 2.2.0, PyTorch Lightning 2.2.0.post0, Optuna 3.5.0.