I have a directory with 5000 NumPy files, each containing an array of shape [n, m], where n is the number of sequences and m is the sequence length. The number of sequences n differs from file to file, and each sequence is one training sample. So far my solution has been to iterate over every file, collect all the sequences in a list, and concatenate them into a single in-memory pool that __getitem__ indexes into. This solution consumes too much memory and slows down training. Is there a more efficient way to build the Dataset?
import os
import numpy as np
import torch
import torch.utils.data as Data


class SequenceDataset(Data.Dataset):
    def __init__(self, signal_dir):
        file_list = os.listdir(signal_dir)
        self.file_count = len(file_list)
        print('file number:', self.file_count)
        self.file_list = file_list
        self.signal_path = signal_dir
        self.signal_len = self.get_signal_length()
        # load every file up front and keep all sequences in memory
        self.signals_pool, self.len_signals = self.getpool()

    def __len__(self):
        return self.len_signals

    def get_signal_length(self):
        # sequence length m, read from the first file
        signal = np.load(os.path.join(self.signal_path, self.file_list[0]))
        return signal.shape[1]

    def getpool(self):
        # concatenate every file into one big [N, m] tensor
        signals = []
        for f in self.file_list:
            signals.append(np.load(os.path.join(self.signal_path, f)))
        signal_array = torch.from_numpy(np.concatenate(signals))
        len_signals = signal_array.shape[0]
        # add a trailing channel dimension -> [N, m, 1]
        return signal_array.unsqueeze(2), len_signals

    def __getitem__(self, item):
        return self.signals_pool[item]
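
For reference, one direction I have been considering is to keep only per-file sequence counts in memory and load rows on demand via memory-mapped np.load (this assumes the files are plain .npy arrays). Below is just an untested sketch of that idea, and LazySequenceDataset is my own placeholder name, not anything from a library:

import os
import numpy as np
import torch
import torch.utils.data as Data


class LazySequenceDataset(Data.Dataset):
    def __init__(self, signal_dir):
        self.signal_path = signal_dir
        self.file_list = sorted(os.listdir(signal_dir))
        # number of sequences per file, read from the array headers via mmap
        counts = [np.load(os.path.join(signal_dir, f), mmap_mode='r').shape[0]
                  for f in self.file_list]
        # cumulative counts map a global sample index to (file, row)
        self.cumulative = np.cumsum(counts)
        self.total = int(self.cumulative[-1])

    def __len__(self):
        return self.total

    def __getitem__(self, item):
        # find which file the global index falls into, then the row within it
        file_idx = int(np.searchsorted(self.cumulative, item, side='right'))
        row = item if file_idx == 0 else item - int(self.cumulative[file_idx - 1])
        signal = np.load(os.path.join(self.signal_path, self.file_list[file_idx]),
                         mmap_mode='r')[row]
        # copy out of the memory map and add the channel dimension, as before
        return torch.from_numpy(np.array(signal)).unsqueeze(1)

The idea is that __getitem__ only touches one file per sample, so the full 5000-file pool never has to sit in RAM; I am not sure how it compares in speed to the concatenated version, which is part of what I am asking.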