Dataset from variable-shaped arrays

I have a directory with 5000 numpy files, each containing an array of shape [n, m], where n is the number of sequences of length m. The number of sequences n differs from file to file, and each sequence constitutes a training sample. So far my solution has been to iterate over every file and build a list containing all the sequences, which __getitem__ then indexes into. This solution consumes too much memory and slows down training. Is there a more efficient way to build the Dataset?

import os

import numpy as np
import torch
import torch.utils.data as Data


class SequenceDataset(Data.Dataset):
    def __init__(self, signal_dir):
        file_list = os.listdir(signal_dir)
        self.file_count = len(file_list)
        print('file number:', self.file_count)
        self.file_list = file_list
        self.signal_path = signal_dir

        self.signal_len = self.get_signal_length()

        # Eagerly load every file and concatenate all sequences into one tensor
        self.signals_pool, self.len_signals = self.getpool()

    def __len__(self):
        return self.len_signals

    def get_signal_length(self):
        # Sequence length m, read from the first file (assumed identical across files)
        signal = np.load(os.path.join(self.signal_path, self.file_list[0]))
        return signal.shape[1]

    def getpool(self):
        # Load every [n_i, m] array and stack them into a single [sum(n_i), m] array
        signals = []
        for f in self.file_list:
            signal = np.load(os.path.join(self.signal_path, f))
            signals.append(signal)

        signal_array = torch.from_numpy(np.concatenate(signals))
        len_signals = signal_array.shape[0]

        # Add a trailing channel dimension: [sum(n_i), m, 1]
        return signal_array.unsqueeze(2), len_signals

    def __getitem__(self, item):
        return self.signals_pool[item]

Instead of preloading the data in the __init__ method, you could store only the file paths and load and process each file lazily in the __getitem__ method, which would reduce the memory usage.
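
For example, here is a minimal sketch of that lazy approach (the class name LazySequenceDataset is made up for illustration, and it assumes the files are plain .npy arrays that np.load can memory-map with mmap_mode='r'). The per-file sequence counts are read from the array headers once in __init__, and __getitem__ maps a global sample index to a (file, row) pair and loads only that row:

import os

import numpy as np
import torch
import torch.utils.data as Data


class LazySequenceDataset(Data.Dataset):
    def __init__(self, signal_dir):
        self.signal_path = signal_dir
        self.file_list = sorted(os.listdir(signal_dir))

        # Memory-map each file so only its header is touched; record n_i per file.
        counts = [
            np.load(os.path.join(signal_dir, f), mmap_mode='r').shape[0]
            for f in self.file_list
        ]

        # cumulative[i] = total number of sequences in files 0..i
        self.cumulative = np.cumsum(counts)
        self.len_signals = int(self.cumulative[-1])

    def __len__(self):
        return self.len_signals

    def __getitem__(self, item):
        # Map the global index to (file index, row within that file).
        file_idx = int(np.searchsorted(self.cumulative, item, side='right'))
        row = item if file_idx == 0 else item - int(self.cumulative[file_idx - 1])

        # Memory-mapped load reads only the requested row from disk.
        signal = np.load(os.path.join(self.signal_path, self.file_list[file_idx]),
                         mmap_mode='r')[row]

        # Copy into a writable array before converting, and keep the trailing
        # channel dimension so the sample shape matches the original [m, 1].
        return torch.from_numpy(np.array(signal)).unsqueeze(1)

With this layout the per-item disk reads replace the big in-memory pool; wrapping the dataset in a DataLoader with num_workers > 0 lets the workers overlap those reads with training.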


Thank you! I will try that!