I have adequate host memory and CUDA memory (I monitored both while the program was running), but my program still fails with “RuntimeError: DataLoader worker (pid(s) 7536) exited unexpectedly”. In my test there are two processes: one hit the error after about 30 minutes and the other after about 3 hours.
I use num_workers=16 and pin_memory=True. In my data loading pipeline I use PIL, torchvision and numpy.random; could these libraries have thread-safety issues?
Below is my data loading related code:
class ImageFolderDataset(Dataset):
    """Dataset that yields the images of a folder as RGB tensors.

    Args:
        root: Directory that contains the image files.
        is_sort: If true, sort the file names (after ``sample_indices`` has
            been applied).
        sample_indices: Optional selection applied to the file list before
            sorting: a slice or a collection of integer positions. An empty
            collection means "no sub-sampling".
        first_k: If set, keep only the first ``first_k`` file names (applied
            after sorting).
        repeat: The reported length is ``len(images) * repeat``; indices wrap
            around the underlying image list.
        cache: ``"none"`` decodes the file on every access, ``"in_memory"``
            decodes every file once in the constructor.
        filenames: File names relative to ``root``. When ``None`` the content
            of ``root`` is listed with ``os.listdir``.
        transform: Optional callable applied to the image tensor.
        target_transform: Stored on the instance; not used by this class.

    Raises:
        NotImplementedError: If ``cache`` is not a supported mode.
    """

    def __init__(self, root: str, is_sort=True, sample_indices=None, first_k=None, repeat=1, cache="none",
                 filenames=None,
                 transform: Optional[Callable] = None,
                 target_transform: Optional[Callable] = None):
        # Validate up front so that an unsupported mode is reported even when
        # the file list happens to be empty.
        if cache not in ("none", "in_memory"):
            raise NotImplementedError("Not implemented cache scenario, use in_memory.")

        self.transform = transform
        self.target_transform = target_transform
        self.repeat = repeat
        self.cache = cache

        if filenames is None:
            filenames = os.listdir(root)

        if sample_indices is not None:
            if isinstance(sample_indices, slice) or len(sample_indices) > 0:
                try:
                    # Works for slices and for array-like containers.
                    filenames = filenames[sample_indices]
                except TypeError:
                    # A plain list cannot be indexed by a list of positions.
                    filenames = [filenames[i] for i in sample_indices]

        if is_sort:
            filenames = sorted(filenames)
        if first_k:
            filenames = filenames[:first_k]

        paths = [os.path.join(root, name) for name in filenames]
        if cache == "none":
            self.images = paths
        else:
            self.images = [self._load_rgb(path) for path in paths]

    @staticmethod
    def _load_rgb(path):
        """Decode ``path`` into an RGB tensor and release the file handle."""
        with Image.open(path) as img:
            return F.to_tensor(img.convert("RGB"))

    def __getitem__(self, idx):
        """Return the (optionally transformed) image at ``idx``, wrapping around."""
        entry = self.images[idx % len(self.images)]
        # With cache == "none" the entry is a path, otherwise a decoded tensor.
        image = self._load_rgb(entry) if self.cache == "none" else entry
        if self.transform:
            image = self.transform(image)
        return image

    def __len__(self):
        """Return the number of images multiplied by ``repeat``."""
        return len(self.images) * self.repeat
class ContinuesSRDataset(Dataset):
    """Wrap an image dataset and yield random (low-res, high-res) pairs.

    For every access a scale factor is drawn uniformly from ``scale_range``,
    a random high-resolution patch is cropped from the source image and the
    low-resolution counterpart is produced with ``resize_fn``.

    Args:
        dataset: Source dataset; each item is indexed as
            ``(channels, height, width)``.
        scale_range: A single number (fixed scale) or a sequence with one or
            two numbers ``(low, high)``.
        lr_size: Optional fixed low-resolution size: an int (square) or a
            sequence with one or two ints ``(height, width)``. When omitted,
            the low-resolution size is derived from the full source image.
        shared_transform: Optional callable ``(lr, hr) -> (lr, hr)``.
        source_transform: Optional callable applied to ``lr``.
        target_transform: Optional callable applied to ``hr``.

    Raises:
        TypeError: If ``lr_size`` or ``scale_range`` has an unsupported type.
        ValueError: If a sequence argument does not have 1 or 2 values.
    """

    def __init__(self, dataset: Dataset,
                 scale_range: Union[Union[int, float], Tuple[Union[int, float], Union[int, float]]],
                 lr_size=None,
                 shared_transform=None,
                 source_transform=None,
                 target_transform=None):
        self.dataset = dataset
        self.shared_transform = shared_transform
        self.source_transform = source_transform
        self.target_transform = target_transform

        if lr_size is not None:
            lr_size = self._as_pair(lr_size, "lr_size", (int,), "int")
        self.lr_size = lr_size
        self.scale_range = self._as_pair(scale_range, "scale_range", (int, float), "int, float")

    @staticmethod
    def _as_pair(value, name, scalar_types, scalar_desc):
        """Validate ``value`` and expand scalars / 1-element sequences to a pair."""
        if not isinstance(value, scalar_types + (Sequence,)):
            raise TypeError("{} should be {} or sequence. Got {}".format(name, scalar_desc, type(value)))
        if isinstance(value, scalar_types):
            return (value, value)
        if len(value) not in (1, 2):
            raise ValueError("If {} is a sequence, it should have 1 or 2 values".format(name))
        if len(value) == 1:
            # A single value is accepted by the validation, so it must also be
            # usable where two values are unpacked later on.
            return (value[0], value[0])
        return value

    def __getitem__(self, idx):
        """Return a randomly cropped and down-scaled ``(lr, hr)`` pair."""
        hr = self.dataset[idx]
        s = random.uniform(self.scale_range[0], self.scale_range[1])

        if self.lr_size:
            h_lr, w_lr = self.lr_size
        else:
            # The small epsilon guards against floating point results that
            # land just below an integer.
            h_lr = math.floor(hr.shape[-2] / s + 1e-9)
            w_lr = math.floor(hr.shape[-1] / s + 1e-9)
        h_hr, w_hr = round(h_lr * s), round(w_lr * s)

        # Random top-left corner; clamped to 0 when the image is smaller than
        # the requested patch.
        rnd_h = random.randint(0, max(0, hr.shape[-2] - h_hr))
        rnd_w = random.randint(0, max(0, hr.shape[-1] - w_hr))
        hr = hr[:, rnd_h:rnd_h + h_hr, rnd_w:rnd_w + w_hr]
        lr = resize_fn(hr, (h_lr, w_lr))

        if self.shared_transform:
            lr, hr = self.shared_transform(lr, hr)
        if self.source_transform:
            lr = self.source_transform(lr)
        if self.target_transform:
            hr = self.target_transform(hr)
        return lr, hr

    def __len__(self):
        """Return the length of the wrapped dataset."""
        return len(self.dataset)
class ImplicitPairedImageDataset(Dataset):
    """Turn ``(lr, hr)`` pairs into coordinate/colour training samples.

    Args:
        dataset: Source dataset whose items are ``(lr, hr)`` pairs.
        sample_q: If set, number of coordinate/colour entries drawn at random
            (without replacement) from the high-resolution image.
        extra_hr: If true, the result additionally contains an ``'hr'`` entry
            holding a random crop of ``hr`` with the spatial size of ``lr``.
    """

    def __init__(self, dataset: Dataset, sample_q = None, extra_hr=False):
        self.dataset = dataset
        self.sample_q = sample_q
        self.extra_hr = extra_hr

    def __getitem__(self, idx):
        """Return a dict with the keys ``inp``, ``coord``, ``gt``, ``gt_size``
        and, when ``extra_hr`` is set, ``hr``."""
        lr, hr = self.dataset[idx]
        hr_coord, hr_rgb = ToCoordColorPair()(hr)

        if self.sample_q is not None:
            # np.random.choice raises ValueError when sample_q exceeds the
            # number of available entries (replace=False).
            picked = np.random.choice(
                len(hr_coord), self.sample_q, replace=False)
            hr_coord = hr_coord[picked]
            hr_rgb = hr_rgb[picked]

        # Spatial size of the high-resolution image as (height, width).
        gt_size = torch.tensor([hr.shape[-2], hr.shape[-1]], dtype=torch.int32)

        result = {
            'inp': lr,
            'coord': hr_coord,
            'gt': hr_rgb,
            'gt_size': gt_size
        }
        if self.extra_hr:
            # RandomCrop expects (height, width): shape[-2] is the height and
            # shape[-1] the width. The previous order was transposed.
            result['hr'] = transforms.RandomCrop((lr.shape[-2], lr.shape[-1]))(hr)
        return result

    def __len__(self):
        """Return the length of the wrapped dataset."""
        return len(self.dataset)