I am having some trouble running PyTorch and PyTorch Lightning with Optuna. After a few trials, I run into the following error:
OSError: [Errno 24] Too many open files
I’ve assembled a script based on Optuna’s example of a PyTorch implementation that very consistently runs into this error on my machine.
"""
Optuna example that optimizes multi-layer perceptrons using PyTorch Lightning.
In this example, we optimize the validation accuracy of fashion product recognition using
PyTorch Lightning, and FashionMNIST. We optimize the neural network architecture. As it is too time
consuming to use the whole FashionMNIST dataset, we here use a small subset of it.
You can run this example as follows, pruning can be turned on and off with the `--pruning`
argument.
$ python pytorch_lightning_simple.py [--pruning]
"""
from torch.utils.data import random_split
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision import datasets
from torch import optim
from torch import nn
import torch.nn.functional as F
import pytorch_lightning as pl
import optuna
import torch
from typing import List, Optional
from packaging import version
import errno
import gc
import os
# --- Experiment size ---------------------------------------------------------
EPOCHS = 1  # passed to the Trainer as max_epochs
CLASSES = 10  # output width of the network's final Linear layer
BATCHSIZE = 128  # batch size handed to every DataLoader
PERCENT_VALID_EXAMPLES = 0.1  # passed to the Trainer as limit_val_batches

# --- DataLoader behaviour ----------------------------------------------------
# According to the report below, the "Too many open files" error only shows up
# when both PIN_MEMORY and PERSISTENT_WORKERS are True.
NUM_WORKERS = 16
PIN_MEMORY = True
PERSISTENT_WORKERS = True

# --- Paths -------------------------------------------------------------------
DIR = os.getcwd()  # dataset root directory given to the data module
def _open_fd_targets():
    """Return a ``{fd: target}`` dict for the entries of ``/proc/self/fd``.

    ``target`` is whatever ``os.readlink`` reports for the descriptor, or
    ``None`` when the link has already disappeared (``ENOENT``). Any other
    ``OSError`` is re-raised. ``os.listdir`` raises if ``/proc/self/fd`` does
    not exist on the current platform.
    """
    base = '/proc/self/fd'
    targets = {}
    for num in os.listdir(base):
        path = None
        try:
            path = os.readlink(os.path.join(base, num))
        except OSError as err:
            # Last FD is always the "listdir" one (which may be closed)
            if err.errno != errno.ENOENT:
                raise
        targets[int(num)] = path
    return targets


def print_debug():
    """Print one ``<fd> <target>`` line per open descriptor, ordered by fd."""
    targets = _open_fd_targets()
    for fd in sorted(targets):
        print(fd, targets[fd])


def print_simple_debug(id):
    """Print a small banner with a caller-chosen tag and the open-fd count.

    Args:
        id: Integer tag printed zero-padded to two digits, used to tell call
            sites apart. The name shadows the builtin ``id``; it is kept so
            existing callers that pass it by keyword keep working.
    """
    count = len(_open_fd_targets())
    border = 11 * '#'
    print(border)
    print(f"## ID:{id:02d} ##")
    print(f"## {count:05d} ##")
    print(border)
class Net(nn.Module):
    """Multi-layer perceptron over flattened 28x28 inputs.

    Each entry of ``output_dims`` adds a ``Linear -> ReLU -> Dropout`` group;
    a final ``Linear`` layer maps to ``CLASSES`` outputs.
    """

    def __init__(self, dropout: float, output_dims: List[int]) -> None:
        super().__init__()
        # Consecutive pairs of this list are the (in, out) sizes of the
        # hidden Linear layers.
        widths = [28 * 28, *output_dims]
        stack: List[nn.Module] = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stack += [nn.Linear(fan_in, fan_out), nn.ReLU(), nn.Dropout(dropout)]
        stack.append(nn.Linear(widths[-1], CLASSES))
        self.layers = nn.Sequential(*stack)

    def forward(self, data: torch.Tensor) -> torch.Tensor:
        """Return log-softmax (over dim 1) of the stacked layers' output."""
        return F.log_softmax(self.layers(data), dim=1)
class LightningNet(pl.LightningModule):
    """Lightning wrapper around ``Net`` with NLL training loss and accuracy logging."""

    def __init__(self, dropout: float, output_dims: List[int]) -> None:
        super().__init__()
        self.model = Net(dropout, output_dims)

    def forward(self, data: torch.Tensor) -> torch.Tensor:
        """Flatten ``data`` to rows of 28 * 28 values and run the wrapped model."""
        flat = data.view(-1, 28 * 28)
        return self.model(flat)

    def training_step(self, batch: List[torch.Tensor], batch_idx: int) -> torch.Tensor:
        """Return the negative log-likelihood loss for one ``(data, target)`` batch."""
        inputs, labels = batch
        return F.nll_loss(self(inputs), labels)

    def validation_step(self, batch: List[torch.Tensor], batch_idx: int) -> None:
        """Log the batch accuracy as ``val_acc`` and, per epoch, as ``hp_metric``."""
        inputs, labels = batch
        predicted = self(inputs).argmax(dim=1, keepdim=True)
        hits = predicted.eq(labels.view_as(predicted))
        accuracy = hits.float().mean()
        self.log("val_acc", accuracy)
        self.log("hp_metric", accuracy, on_step=False, on_epoch=True)

    def configure_optimizers(self) -> optim.Optimizer:
        """Return an Adam optimizer over the wrapped model's parameters."""
        return optim.Adam(self.model.parameters())
class FashionMNISTDataModule(pl.LightningDataModule):
    """Data module serving FashionMNIST train/validation/test loaders.

    Args:
        data_dir: Root directory passed to ``datasets.FashionMNIST``.
        batch_size: Batch size used by every loader.
        num_workers: ``num_workers`` value used by every loader.
        pin_memory: ``pin_memory`` value used by every loader.
        persistent_workers: Requested ``persistent_workers`` value; only
            applied when ``num_workers > 0`` (see ``_make_loader``).
    """

    def __init__(self, data_dir: str, batch_size: int, num_workers: int, pin_memory: bool, persistent_workers: bool):
        super().__init__()
        self.persistent_workers = persistent_workers
        self.num_workers = num_workers
        self.pin_memory = pin_memory
        self.batch_size = batch_size
        self.data_dir = data_dir

    def setup(self, stage: Optional[str] = None) -> None:
        """Create the test dataset and a 55000/5000 train/validation split.

        ``stage`` is accepted for interface compatibility and is not used:
        all three datasets are built on every call.
        """
        self.mnist_test = datasets.FashionMNIST(
            self.data_dir, train=False, download=True, transform=transforms.ToTensor()
        )
        mnist_full = datasets.FashionMNIST(
            self.data_dir, train=True, download=True, transform=transforms.ToTensor()
        )
        self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])

    def _make_loader(self, dataset) -> DataLoader:
        """Build a non-shuffling ``DataLoader`` over ``dataset`` with the stored settings."""
        # DataLoader raises ValueError for persistent_workers=True when
        # num_workers == 0, so the flag is only forwarded when workers exist.
        keep_workers = bool(self.persistent_workers and self.num_workers > 0)
        return DataLoader(
            dataset,
            batch_size=self.batch_size,
            num_workers=self.num_workers,
            persistent_workers=keep_workers,
            pin_memory=self.pin_memory,
            # NOTE(review): shuffling is off for the training split as well;
            # kept as written - confirm this is intended.
            shuffle=False,
        )

    def train_dataloader(self) -> DataLoader:
        """Return a new loader over the training split."""
        return self._make_loader(self.mnist_train)

    def val_dataloader(self) -> DataLoader:
        """Return a new loader over the validation split."""
        return self._make_loader(self.mnist_val)

    def test_dataloader(self) -> DataLoader:
        """Return a new loader over the test dataset."""
        return self._make_loader(self.mnist_test)
def objective(trial: optuna.trial.Trial) -> float:
    """Train one sampled network and return its logged validation accuracy.

    Args:
        trial: Optuna trial used to sample the number of layers, the dropout
            rate and the width of each hidden layer.

    Returns:
        The ``val_acc`` entry of the trainer's callback metrics as a float.
    """
    # We optimize the number of layers, hidden units in each layer and dropouts.
    n_layers = trial.suggest_int("n_layers", 1, 3)
    dropout = trial.suggest_float("dropout", 0.2, 0.5)
    output_dims = [
        trial.suggest_int(f"n_units_l{i}", 4, 128, log=True) for i in range(n_layers)
    ]
    hyperparameters = dict(n_layers=n_layers, dropout=dropout, output_dims=output_dims)

    trainer = pl.Trainer(
        limit_val_batches=PERCENT_VALID_EXAMPLES,
        enable_checkpointing=False,
        accelerator="auto",
        max_epochs=EPOCHS,
        logger=True,
        devices=1,
    )
    datamodule = FashionMNISTDataModule(
        persistent_workers=PERSISTENT_WORKERS,
        num_workers=NUM_WORKERS,
        pin_memory=PIN_MEMORY,
        batch_size=BATCHSIZE,
        data_dir=DIR,
    )
    model = LightningNet(dropout, output_dims)
    trainer.logger.log_hyperparams(hyperparameters)

    # Open-descriptor count before and after training, to track the leak.
    print_simple_debug(0)
    trainer.fit(model, datamodule=datamodule)
    print_simple_debug(1)

    # Read the metric first so the objects that own it can be released below.
    val_acc = trainer.callback_metrics["val_acc"].item()

    # Collecting while these names are still bound cannot reclaim the objects
    # they refer to, so drop the references before running the collector.
    # NOTE(review): whether this is enough to release the leaked pipe
    # descriptors of the DataLoader workers has not been verified.
    del trainer, datamodule, model
    gc.collect()
    return val_acc
if __name__ == "__main__":
    # Process-wide torch settings; the two calls are independent of each other.
    torch.set_float32_matmul_precision('medium')
    torch.multiprocessing.set_sharing_strategy('file_system')

    # Run 100 trials, maximizing the value returned by `objective`.
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100)

    # Summary of the search.
    finished = len(study.trials)
    print("Number of finished trials: {}".format(finished))
    print("Best trial:")
    best = study.best_trial
    print(" Value: {}".format(best.value))
    print(" Params: ")
    for name in best.params:
        print(" {}: {}".format(name, best.params[name]))
When both PERSISTENT_WORKERS and PIN_MEMORY are set to True, I get the error; if either or both are set to False, the code runs without issue. pin_memory didn’t have a big impact on performance for me, so I’m running my code without it.
The print_simple_debug function prints how many files the Python process has open, a number that keeps increasing until the eventual crash. The print_debug() function shows which files are open, but the files from this weird bug are all named ‘pipe [NUMBER]’, so it hasn’t helped much.
I think it might be related to the multi-threading of the dataloader iterator, since a higher number of workers leads to a quicker crash and the persistent_workers flag is needed to trigger it, but I have not been able to determine the precise cause of this issue.
I am running this on Ubuntu 22 with PyTorch 2.0.1, PyTorch Lightning 2.1.2, and Optuna 3.4.0.