I use DataLoader to load a big dataset (16 GB), but I run out of memory. Are there any suggestions for how to use DataLoader when the dataset doesn't fit in memory?
Here’s my code:
import jsonlines
import pytorch_lightning as pl
import torch
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

class StockDataset(Dataset):
    def __init__(self, file_path):
        # The entire JSONL file is read into memory here, which is where the OOM happens.
        self.data = []
        with jsonlines.open(file_path, 'r') as reader:
            for item in tqdm(reader, total=2564721, desc="Loading Data"):
                self.data.append(item)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sample = self.data[idx]
        input_ids = torch.tensor([item['Close'] for item in sample['sample']])
        label = torch.tensor(sample['label'])
        return input_ids, label

class StockDataModule(pl.LightningDataModule):
    def __init__(self, train_file, val_file, test_file, batch_size=32):
        super().__init__()
        self.train_file = train_file
        self.val_file = val_file
        self.test_file = test_file
        self.batch_size = batch_size

    def setup(self, stage=None):
        self.train_dataset = StockDataset(self.train_file)
        self.val_dataset = StockDataset(self.val_file)
        self.test_dataset = StockDataset(self.test_file)

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True)

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size)

    def test_dataloader(self):
        return DataLoader(self.test_dataset, batch_size=self.batch_size)
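One idea I had was to record the byte offset of every JSON line up front and only parse a line when __getitem__ asks for it, so that only the offsets (not the 16 GB of samples) stay in RAM. Here's a rough sketch of what I mean (LazyStockDataset and the offset bookkeeping are just my own guess at an approach, not something I've validated at this scale):

import json
import torch
from torch.utils.data import Dataset

class LazyStockDataset(Dataset):
    """Keeps only line offsets in memory and parses one JSONL record per __getitem__."""
    def __init__(self, file_path):
        self.file_path = file_path
        self.offsets = []
        # One pass over the file to remember where each line starts.
        with open(file_path, 'rb') as f:
            offset = 0
            for line in f:
                self.offsets.append(offset)
                offset += len(line)

    def __len__(self):
        return len(self.offsets)

    def __getitem__(self, idx):
        # Seek to the stored offset and decode just that one line.
        with open(self.file_path, 'rb') as f:
            f.seek(self.offsets[idx])
            sample = json.loads(f.readline())
        input_ids = torch.tensor([item['Close'] for item in sample['sample']])
        label = torch.tensor(sample['label'])
        return input_ids, label

Would something like this play nicely with shuffle=True and num_workers > 0, or is an IterableDataset (or converting the data to a format like HDF5) the more idiomatic fix here?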