For those trying to load text data efficiently, you could leverage linecache and subprocess. This works for the case when you have one huge file, let's say 100+ GB, where every row is one training example.
import csv
import linecache
import subprocess

from torch.utils.data import Dataset

class LazyTextDataset(Dataset):
    def __init__(self, filename):
        self._filename = filename
        # Count the lines once up front so __len__ is cheap and nothing is read into memory.
        self._total_data = int(subprocess.check_output(["wc", "-l", filename]).split()[0])

    def __getitem__(self, idx):
        # linecache reads and caches lines lazily; its line numbers are 1-based.
        line = linecache.getline(self._filename, idx + 1)
        csv_line = csv.reader([line])
        return next(csv_line)

    def __len__(self):
        return self._total_data
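
As a rough usage sketch: the dataset can be wrapped in a standard DataLoader. The file name "train.csv" and the batch size are placeholders, and for real text you would typically tokenize each row or pass a custom collate_fn.

from torch.utils.data import DataLoader

dataset = LazyTextDataset("train.csv")  # hypothetical path to the large file
loader = DataLoader(dataset, batch_size=32, num_workers=4)

for batch in loader:
    # each sample is a list of CSV fields; convert to tensors / tokenize here
    pass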