I am trying to load data from a zip file by Python zipfile library. However, it seems that it cannot compatible with the torch’s Dataloader class.
import numpy as np
import cv2
import io
from torch.utils.data import DataLoader, Dataset
from torchvision.transforms import ToTensor
import zipfile
class ZipDataset(Dataset):
def __init__(self, root_path, cache_into_memory=False):
if cache_into_memory:
f = open(root_path, 'rb')
self.zip_content = f.read()
f.close()
self.zip_file = zipfile.ZipFile(io.BytesIO(self.zip_content), 'r')
else:
self.zip_file = zipfile.ZipFile(root_path, 'r')
self.name_list = list(filter(lambda x: x[-4:] == '.jpg', self.zip_file.namelist()))
self.to_tensor = ToTensor()
def __getitem__(self, key):
buf = self.zip_file.read(name=self.name_list[key])
img = self.to_tensor(cv2.imdecode(np.fromstring(buf, dtype=np.uint8), cv2.IMREAD_COLOR))
return img
def __len__(self):
return len(self.name_list)
if __name__ == '__main__':
dataset = ZipDataset('COCO.zip', cache_into_memory=False)
dataloader = DataLoader(dataset, batch_size=2, num_workers=2)
for batch_idx, sample in enumerate(dataloader):
print(batch_idx, sample.size())
When num_workers=0 or num_workers=1, everything works well. But if the num_workers is larger than 1, the program will raise a strange error:
Traceback (most recent call last):
File "test_zip_file.py", line 31, in <module>
for batch_idx, sample in enumerate(dataloader):
File "/home/admin/anaconda3/envs/pytorch1_0/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 637, in __next__
return self._process_next_batch(batch)
File "/home/admin/anaconda3/envs/pytorch1_0/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 658, in _process_next_batch
raise batch.exc_type(batch.exc_msg)
zipfile.BadZipFile: Traceback (most recent call last):
File "/home/admin/anaconda3/envs/pytorch1_0/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 138, in _worker_loop
samples = collate_fn([dataset[i] for i in batch_indices])
File "/home/admin/anaconda3/envs/pytorch1_0/lib/python3.6/site-packages/torch/utils/data/dataloader.py", line 138, in <listcomp>
samples = collate_fn([dataset[i] for i in batch_indices])
File "test_zip_file.py", line 21, in __getitem__
buf = self.zip_file.read(name=self.name_list[key])
File "/home/admin/anaconda3/envs/pytorch1_0/lib/python3.6/zipfile.py", line 1337, in read
with self.open(name, "r", pwd) as fp:
File "/home/admin/anaconda3/envs/pytorch1_0/lib/python3.6/zipfile.py", line 1419, in open
% (zinfo.orig_filename, fname))
zipfile.BadZipFile: File name in directory '000000000009.1.jpg' and header b'\x00(\xa2\x8a\x00(\xa2\x8a\x00(\xa2\x8a\x00(\xa2\x8a\x00(\xa2\x8a\x00(\xa2\x8a\x00(\xa2\x8a\x00(\... ' differ.
It looks like that zipfile cannot be read in the multiprocessing manner. But interestingly, if the we set cache_into_memory=True (which means that the total zip file will be read into memory), the program will work fine.
This code has been tested in Windows 10 / Ubuntu 16.04, torch 0.41 & 1.0.0. All of them have the same results.