Hi everyone
I am loading large NumPy arrays, each over 1 GB in size; in total I have around 100,000 such files. The issue is that when I iterate over them through a DataLoader, I do not get a progress bar: I wrapped the DataLoader in tqdm, but the progress bar is not shown.
This is what I have done:
class CustomDataset(Dataset):
    """Paired source/target dataset backed by memory-mapped ``.npy`` shards.

    Every file is opened with ``np.load(..., mmap_mode='r')``, so array
    contents stay on disk and only the rows that are indexed get read.
    A flat sample index is translated to ``(shard, row-in-shard)`` by
    bisecting a list of cumulative start offsets. Source and target keep
    separate offset tables, so the two sides may be sharded differently
    as long as their total row counts agree.

    Args:
        source_path: Sequence of paths to the source ``.npy`` files.
        target_path: Sequence of paths to the target ``.npy`` files.
        transform: Stored on ``self.transform``. It is not applied in
            ``__getitem__`` because the original code never used it and
            its intended call signature is not known.

    Raises:
        ValueError: If source and target contain different numbers of rows.
    """

    def __init__(self, source_path, target_path, transform=None):
        self.transform = transform

        # SECURITY: allow_pickle=True lets np.load unpickle object arrays,
        # which can execute arbitrary code. Only load trusted files, or
        # switch to allow_pickle=False if the arrays are plain numeric data.
        # NOTE(review): every file is mapped up front; with a very large
        # number of files this may run into OS limits on open
        # files/mappings -- confirm on the target machine.
        self.source_data = [
            np.load(path, allow_pickle=True, mmap_mode='r') for path in source_path
        ]
        self.target_data = [
            np.load(path, allow_pickle=True, mmap_mode='r') for path in target_path
        ]

        # start_indices[i] is the flat index of the first row of shard i.
        self.start_indices, self.data_count_source = self._index_shards(
            self.source_data
        )
        self.target_start_indices, self.data_count_target = self._index_shards(
            self.target_data
        )

        # A count mismatch would silently misalign (source, target) pairs.
        if self.data_count_source != self.data_count_target:
            raise ValueError(
                f"source has {self.data_count_source} rows but target has "
                f"{self.data_count_target}; the two sides must be aligned"
            )

    @staticmethod
    def _index_shards(shards):
        """Return (cumulative start offsets, total row count) for shards."""
        starts = []
        total = 0
        for shard in shards:
            starts.append(total)
            total += shard.shape[0]
        return starts, total

    @staticmethod
    def _lookup(shards, starts, index):
        """Return the row at flat position ``index`` across ``shards``."""
        # bisect (right) - 1 gives the last shard whose start is <= index,
        # which also steps over empty shards that share a start offset.
        shard_idx = bisect(starts, index) - 1
        return shards[shard_idx][index - starts[shard_idx]]

    def __len__(self):
        return self.data_count_source

    def __getitem__(self, index):
        """Return the ``(source, target)`` rows at flat position ``index``.

        Negative indices count from the end.

        Raises:
            IndexError: If ``index`` is outside ``[-len(self), len(self))``.
        """
        total = self.data_count_source
        if index < 0:
            index += total
        if not 0 <= index < total:
            raise IndexError(f"index out of range for dataset of size {total}")

        source_data = self._lookup(self.source_data, self.start_indices, index)
        target_data = self._lookup(
            self.target_data, self.target_start_indices, index
        )
        return source_data, target_data
# Training shards are read from <root_dir>/train/source and <root_dir>/train/target.
root_dir = 'en-hi/'
source_path = os.path.join(root_dir, 'train', 'source')
target_path = os.path.join(root_dir, 'train', 'target')

# The pattern needs the `*` wildcard: without it glob only matches a file
# literally named ".pt.npy", so both lists come back empty.
# glob returns files in arbitrary order; sorting makes the order
# deterministic. NOTE(review): this assumes source and target file names
# sort into matching order -- confirm against the naming scheme.
train_source_file = sorted(glob.glob(os.path.join(source_path, '*.pt.npy')))
train_target_file = sorted(glob.glob(os.path.join(target_path, '*.pt.npy')))

# Fail loudly instead of building an empty dataset, which would make the
# loop below finish immediately without any visible progress.
if not train_source_file or not train_target_file:
    raise FileNotFoundError(
        f"no '*.pt.npy' files found under {source_path!r} and/or {target_path!r}"
    )

train_dataset = CustomDataset(train_source_file, train_target_file)
train_dataloader = DataLoader(
    train_dataset, batch_size=1, drop_last=False, num_workers=0
)

# Wrapping the loader in tqdm shows per-batch progress while iterating.
pbar = tqdm(train_dataloader)
for d in pbar:
    pass
Is there any way to track the progress of loading the NumPy arrays? Thank you for any help.