I have a dataset of about 20 GB, so I can't load it directly into RAM.

I created an LMDB database for my data, and I wrote my own dataset class, similar to the MNIST dataset in torchvision.

Here is my code:

```
from __future__ import print_function
import torch.utils.data as data
# import h5py
import numpy as np
import lmdb
class onlineHCCR(data.Dataset):
    """Online HCCR dataset backed by two prebuilt LMDB databases.

    Records are stored under zero-padded decimal keys (``'{:08}'``):
    each data record is a flat float64 buffer that reshapes to (150, 6),
    and each label record is an int buffer whose first element is the
    class id.

    NOTE(review): the LMDB environments are opened in ``__init__``; with
    ``DataLoader(num_workers > 0)`` the handles are inherited across the
    fork, which LMDB does not support reliably and which can also serialize
    reads. If loading is slow, open the environments lazily on first use
    inside ``__getitem__`` so each worker gets its own handle.
    """

    def __init__(self, train=True):
        self.train = train
        # The train and test splits differ only in their database paths.
        if self.train:
            datalmdb_path = 'traindata_lmdb'
            labellmdb_path = 'trainlabel_lmdb'
        else:
            datalmdb_path = 'testdata_lmdb'
            labellmdb_path = 'testlabel_lmdb'
        self.data_env = lmdb.open(datalmdb_path, readonly=True)
        self.label_env = lmdb.open(labellmdb_path, readonly=True)

    def __getitem__(self, index):
        """Return ``(data, target)``: a float32 (150, 6) array and its label.

        The original code duplicated this body for train/test, but both
        branches were identical — the split is already fixed by the
        environments opened in ``__init__``.
        """
        # LMDB keys are bytes; encode the zero-padded index once and reuse
        # it for both databases. (Passing a str key fails on Python 3.)
        key = '{:08}'.format(index).encode('ascii')
        with self.data_env.begin() as txn:
            raw = txn.get(key)
            # np.frombuffer replaces the deprecated np.fromstring and avoids
            # an extra copy; astype() yields a fresh, writable float32 array.
            sample = np.frombuffer(raw, dtype=float).reshape(150, 6).astype('float32')
        with self.label_env.begin() as txn:
            raw = txn.get(key)
            target = np.frombuffer(raw, dtype=int)[0]
        return sample, target

    def __len__(self):
        # Sizes are fixed by the prebuilt LMDB databases.
        return 2693931 if self.train else 224589
```

However, it seems to be very slow: my GPU utilization stays around 1%, even though it does use about 1 GB of GPU memory.

How can I solve this problem? What is the best practice for loading large datasets in PyTorch?