I have a 400GB data,but my cpu memory is only 256GB. The first parameter of torch.utils.data.DataLoader is dataset. I fount i still need to load all the data to memory when i create a dataset. following is my code
class SignalDataset(Data.Dataset):
class SignalDataset(Data.Dataset):
def __init__(self, signal_path, label_path, signal_max_len, label_max_len):
with open(signal_path, 'r') as tr:
self.signals = tr.readlines()
with open(label_path, 'r') as te:
self.labels = te.readlines()
self.signal_max_len = signal_max_len
self.label_max_len = label_max_len
@staticmethod
def transform(signal):
'''
:param signal: shape (length,)
:return: shape (length, 1)
'''
signal = np.array([float(v) for v in signal]).reshape(-1,1)
return signal
def __getitem__(self, index):
signal_max_len = self.signal_max_len
label_max_len = self.label_max_len
signal = self.signals[index].strip().split(',')
# str 2 int
label = np.array([int(d) for d in self.labels[index].strip().split(',')])
# str 2 float, reshape
signal = np.array([float(v) for v in signal]).reshape(-1,1)
assert len(signal.shape) == 2
assert signal_max_len - signal.shape[0] >= 0
signal = np.pad(signal, ((0, signal_max_len - signal.shape[0]), (0, 0)),
mode='constant',
constant_values=Constants.SIG_PAD)
assert len(label.shape) == 1
label = np.pad(np.array(label), (1, 1), mode='constant',
constant_values=(Constants.BOS, Constants.EOS))
# label PAD padding
assert label_max_len - label.shape[0] >= 0
label = np.pad(label, (0, label_max_len - label.shape[0]), mode='constant',
constant_values=Constants.PAD)
return signal, label
def __len__(self):
return len(self.labels)