When my resources are sufficient, why does the DataLoader worker still get killed?

I have adequate memory and CUDA memory (I monitor both throughout the run), but my program still encounters "RuntimeError: DataLoader worker (pid(s) 7536) exited unexpectedly". In my test there are two processes: one encounters the error after about 30 minutes and the other after about 3 hours.
I use num_workers=16 and pin_memory=True. In my data loading pipeline I use PIL/torchvision/numpy.random — could these libraries have thread-safety issues?

Below is my data loading related code:

class ImageFolderDataset(Dataset):
    """Dataset representing a folder of images.

    Args:
        root: Directory containing the image files.
        is_sort: If True, sort filenames so ordering is deterministic.
        sample_indices: Optional iterable of indices selecting a subset of
            the listed files (only used when ``filenames`` is None).
        first_k: If given, keep only the first ``first_k`` files.
        repeat: Virtually repeat the dataset this many times (affects
            ``__len__``; indices wrap around in ``__getitem__``).
        cache: ``"none"`` to store paths and decode lazily in ``__getitem__``,
            or ``"in_memory"`` to decode every image into a tensor up front.
        transform: Optional transform applied to each returned image.
        target_transform: Optional transform kept for API symmetry
            (not applied anywhere in this class).
        filenames: Optional explicit list of filenames; when None, ``root``
            is listed with ``os.listdir``.
    """

    def __init__(self, root: str, is_sort=True, sample_indices=None, first_k=None, repeat=1, cache="none",
                 transform: Optional[Callable] = None,
                 target_transform: Optional[Callable] = None,
                 filenames: Optional[Sequence[str]] = None):
        self.transform = transform
        self.target_transform = target_transform
        self.repeat = repeat
        self.cache = cache

        if filenames is None:
            filenames = os.listdir(root)

            if sample_indices:
                # A plain list cannot be indexed by a list of indices;
                # pick the requested entries one by one.
                filenames = [filenames[i] for i in sample_indices]

        if is_sort:
            filenames = sorted(filenames)

        if first_k:
            filenames = filenames[:first_k]

        self.images = []
        for filename in filenames:
            file = os.path.join(root, filename)

            if cache == 'none':
                # Lazy mode: store only the path; decoding happens in __getitem__.
                self.images.append(file)
            elif cache == 'in_memory':
                # Eager mode: decode once up front; __getitem__ returns the tensor as-is.
                self.images.append(F.to_tensor(Image.open(file).convert("RGB")))
            else:
                raise NotImplementedError(
                    "Not implemented cache scenario, use 'none' or 'in_memory'.")

    def __getitem__(self, idx):
        # idx is taken modulo the real image count so that `repeat` works.
        if self.cache == "none":
            image = F.to_tensor(Image.open(self.images[idx % len(self.images)]).convert("RGB"))
        elif self.cache == "in_memory":
            image = self.images[idx % len(self.images)]

        if self.transform:
            image = self.transform(image)

        return image

    def __len__(self):
        return len(self.images) * self.repeat

class ContinuesSRDataset(Dataset):
    """Continuous-scale super-resolution dataset.

    Wraps a dataset of HR image tensors and, per item, samples a scale
    factor, crops the HR image to an integer-multiple size, and produces
    the matching LR image by resizing.

    Args:
        dataset: Underlying dataset yielding HR image tensors (C, H, W).
        scale_range: A single number or a (min, max) pair; the scale is
            drawn uniformly from this range per item.
        lr_size: Optional fixed LR size (int or (h, w)). When None, the LR
            size is derived from the HR size and the sampled scale.
        shared_transform: Optional callable applied to (lr, hr) jointly.
        source_transform: Optional callable applied to the LR image.
        target_transform: Optional callable applied to the HR image.
    """

    def __init__(self, dataset: Dataset,
                 scale_range: Union[Union[int, float], Tuple[Union[int, float], Union[int, float]]],
                 lr_size: Optional[Union[int, Tuple[int, int]]] = None,
                 shared_transform: Optional[Callable] = None,
                 source_transform: Optional[Callable] = None,
                 target_transform: Optional[Callable] = None):
        self.dataset = dataset
        self.shared_transform = shared_transform
        self.source_transform = source_transform
        self.target_transform = target_transform

        if lr_size is not None:
            if not isinstance(lr_size, (int, Sequence)):
                raise TypeError("lr_size should be int or sequence. Got {}".format(type(lr_size)))
            if isinstance(lr_size, Sequence) and len(lr_size) not in (1, 2):
                raise ValueError("If lr_size is a sequence, it should have 1 or 2 values")

            if isinstance(lr_size, int):
                lr_size = (lr_size, lr_size)
            elif len(lr_size) == 1:
                # A 1-element sequence means a square LR patch.
                lr_size = (lr_size[0], lr_size[0])
        self.lr_size = lr_size

        if not isinstance(scale_range, (int, float, Sequence)):
            raise TypeError("scale_range should be a number or sequence. Got {}".format(type(scale_range)))
        if isinstance(scale_range, Sequence) and len(scale_range) not in (1, 2):
            raise ValueError("If scale_range is a sequence, it should have 1 or 2 values")

        if isinstance(scale_range, (int, float)):
            scale_range = (scale_range, scale_range)
        elif len(scale_range) == 1:
            # A 1-element sequence means a fixed scale.
            scale_range = (scale_range[0], scale_range[0])

        self.scale_range = scale_range

    def __getitem__(self, idx):
        hr = self.dataset[idx]
        # Sample a scale factor for this item.
        s = random.uniform(self.scale_range[0], self.scale_range[1])

        if self.lr_size is not None:
            h_lr, w_lr = self.lr_size
        else:
            # Derive the LR size from the HR image and the sampled scale.
            # The epsilon guards against floating-point round-down.
            h_lr = math.floor(hr.shape[-2] / s + 1e-9)
            w_lr = math.floor(hr.shape[-1] / s + 1e-9)
        h_hr, w_hr = round(h_lr * s), round(w_lr * s)

        # Random crop position inside the HR image.
        rnd_h = random.randint(0, max(0, hr.shape[-2] - h_hr))
        rnd_w = random.randint(0, max(0, hr.shape[-1] - w_hr))
        hr = hr[:, rnd_h:rnd_h + h_hr, rnd_w:rnd_w + w_hr]

        lr = resize_fn(hr, (h_lr, w_lr))

        if self.shared_transform:
            lr, hr = self.shared_transform(lr, hr)

        if self.source_transform:
            lr = self.source_transform(lr)
        if self.target_transform:
            hr = self.target_transform(hr)

        return lr, hr

    def __len__(self):
        return len(self.dataset)

class ImplicitPairedImageDataset(Dataset):
    """Wrap a paired (lr, hr) dataset for implicit-function training.

    Each item exposes the LR image as the network input and the HR image
    as coordinate/RGB pairs (optionally subsampled to ``sample_q`` points).

    Args:
        dataset: Underlying dataset yielding (lr, hr) tensor pairs.
        sample_q: Number of HR coordinate samples to keep per item
            (None keeps all of them).
        extra_hr: If True, also return an HR crop under the 'hr' key.
    """

    def __init__(self, dataset: Dataset, sample_q=None, extra_hr=False):
        self.dataset = dataset
        self.sample_q = sample_q
        self.extra_hr = extra_hr

    def __getitem__(self, idx):
        lr, hr = self.dataset[idx]

        # Flatten the HR image into (coordinate, color) pairs.
        hr_coord, hr_rgb = ToCoordColorPair()(hr)
        if self.sample_q is not None:
            # Subsample without replacement so each coordinate is unique.
            sample_lst = np.random.choice(
                len(hr_coord), self.sample_q, replace=False)
            hr_coord = hr_coord[sample_lst]
            hr_rgb = hr_rgb[sample_lst]

        # Ground-truth HR spatial size as an int tensor (h, w).
        gt_size = torch.ones(2).int()
        gt_size[0] = hr.shape[-2]
        gt_size[1] = hr.shape[-1]

        result = {
            'inp': lr,
            'coord': hr_coord,
            'gt': hr_rgb,
            'gt_size': gt_size,
        }
        if self.extra_hr:
            # NOTE(review): RandomCrop expects (h, w), but lr.shape[-1] is the
            # width and lr.shape[-2] the height, so this crops a transposed
            # size — confirm whether the swap is intentional.
            result['hr'] = transforms.RandomCrop((lr.shape[-1], lr.shape[-2]))(hr)
        return result

    def __len__(self):
        return len(self.dataset)

It’s a bit hard to tell what might be failing, and I would hope the full stack trace shows where the issue comes from (e.g. an invalid index or an invalid transformation).
As a debugging step you could use num_workers=0 and check if it works. Performance would of course be lower, but the stack trace could give you a better error message.

Do you set persistent_workers to True when you construct the DataLoader instance?

I have encountered an error when both persistent_workers and pin_memory are set to True, and num_workers is more than zero. Perhaps you are hitting the same issue?

I don’t remember the exact error message now, but if you are facing the same issue as I did, setting persistent_workers to False may make it go away.