Hi,
I have an issue returning the paths of files from a hdf5 dataset using the dataloader.
I get the following error:
TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found object
It works fine for datum and label, which are tensors; however, it does not work for "path".
An example path stored in the hdf5 dataset:
['mults/train/0/5678.ndpi/40x/40x-236247-16635-80640-8704.png']
Below is the code on the features_dataset class:
import h5py
from torch.utils import data
from torchvision import transforms as T
class Features_Dataset(data.Dataset):
    """Dataset backed by an HDF5 archive holding features, labels, and image paths.

    The HDF5 file is opened lazily on first access in each process rather than
    in ``__init__``: h5py file handles are not safe to share across the fork
    that ``DataLoader(num_workers>0)`` performs, so each worker must open its
    own handle.

    Args:
        archive: path to the ``.hdf5`` file.
        phase: dataset split prefix, e.g. ``'train'`` (selects the
            ``train_labels`` / ``train_all_arrays`` / ``train_img_paths``
            datasets inside the archive).
    """

    def __init__(self, archive, phase):
        # Store only the path and phase; the handle is created lazily in
        # _ensure_open() so every DataLoader worker opens its own file.
        self.archive_path = archive
        self.phase = str(phase)
        self.archive = None

    def _ensure_open(self):
        # Open the HDF5 file on first use in the current process.
        if self.archive is None:
            self.archive = h5py.File(self.archive_path, 'r')
            self.labels = self.archive[self.phase + '_labels']
            self.data = self.archive[self.phase + '_all_arrays']
            self.img_paths = self.archive[self.phase + '_img_paths']

    @staticmethod
    def _to_str(raw):
        # h5py variable-length string datasets may yield ``bytes`` (or arrays
        # of bytes, when the dataset is 2-D). ``default_collate`` rejects
        # object arrays — this is the reported TypeError — but it does accept
        # plain ``str`` (and lists of str), so normalize here.
        if isinstance(raw, bytes):
            return raw.decode('utf-8')
        if isinstance(raw, str):
            return raw
        return [Features_Dataset._to_str(p) for p in raw]

    def __getitem__(self, index):
        """Return ``(datum, label, path)`` for the given index."""
        self._ensure_open()
        datum = self.data[index]
        label = int(self.labels[index])
        path = self._to_str(self.img_paths[index])
        return datum, label, path

    def __len__(self):
        self._ensure_open()
        return len(self.data)

    def close(self):
        # Close the archive if it was opened in this process.
        if self.archive is not None:
            self.archive.close()
            self.archive = None
if __name__ == '__main__':
    train_dataset = Features_Dataset(archive="featuresdata/train.hdf5", phase='train')
    trainloader = data.DataLoader(train_dataset, num_workers=1, batch_size=1)
    print(len(trainloader))
    # Loop variable renamed from `data` to `datum`: the original name
    # shadowed the `torch.utils.data` module imported at the top of the file.
    for i, (datum, label, path) in enumerate(trainloader):
        print(path)
Here is an small script of what I did to store the particular paths in the hdf5 dataset:
# Pick the HDF5 variable-length string dtype matching the running Python
# version (str on Python 3, unicode on Python 2).
if sys.version_info >= (3, 0):
    string_type = h5py.special_dtype(vlen=str)
else:
    string_type = h5py.special_dtype(vlen=unicode)  # noqa
# NOTE(review): this shape is 2-D (dataset_length, batch_size), so indexing a
# single row of the paths dataset yields an *array* of strings rather than one
# string — presumably this is what makes default_collate see an object array.
# Confirm whether a 1-D shape (dataset_length,) was intended.
labels_shape = (dataset_length, args.batch_size)
path = args.HDF5_dataset + f'{phase}.hdf5'  # e.g. ".../train.hdf5"
with h5py.File(path, mode='a') as hdf5_file:
    # Resizable along the first axis (maxshape first entry is None) so more
    # rows can be appended later.
    array_paths = hdf5_file.create_dataset(
        f'{phase}_img_paths', labels_shape, maxshape=(None, args.batch_size), dtype=string_type)
I'm not really sure what to do, as paths are returned fine with alternative dataloaders when reading from a directory of images.