Dataloader for multiple files with sliding window

I am working on a problem where I have multiple CSVs files and I need to read those multiple CSVs one by one with a sliding window. Let’s assume that, one CSV file is having 330 data points and the window size is 32 so we should be having (10*32 = 320) and the last 10 points will be discarded.

I started making a dataset that looks like this but after spending too much time, I am not able to get it working. The current code looks like this,

class CustomDataset(Dataset):
    def __init__(self, data_folder, window_size):
        self.data_folder = data_folder
        self.data_file_list = [file for file in os.listdir(data_folder)]
        print(self.data_file_list)
        self.window_size = window_size
        
    def __len__(self):
        return len(self.data_file_list[0])

    def __getitem__(self, idx):
        filename = self.data_file_list[idx]
        data, label = read_file(filename)
        return data, label
    
    def read_file(self, filename):
        data = pd.read_csv(filename)
        data = data.drop(["file_name", "class_name"], axis = 1)
        features = data.drop(["class_no"], axis = 1)
        labels = data["class_no"]
        x = [features[index:index+self.window_size].values for index in range(0, len(features))]
        y = [labels[index:index+self.window_size].values for index in range(0, len(labels))]
        
        return x, y

Note: I can’t merge all these CSV files into one.

I am getting this error,

TypeError: object of type ‘type’ has no len().

Something similar to this?

from sklearn.datasets import load_diabetes, load_iris, load_breast_cancer

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd

class SimpleDataset(Dataset):
def init(self, features, target, name):
self.name = name
self.features = features
self.target = target
self.len = len(target)

def __len__(self):
    return self.len

def __getitem__(self, idx):
    features = self.features[idx]
    target = self.target[idx]
    return features, target

def dataloader_generator(dataset_list):
for aux in dataset_list:
bunch = aux()
dataset = SimpleDataset(bunch.data, bunch.target, str(aux))
dataloader = DataLoader(dataset=dataset, batch_size=73, drop_last=True)
yield dataloader

dataset_list = [load_diabetes, load_iris, load_breast_cancer]

for epoch in range(2):
print(f’Epoch: {epoch}‘)
for dl in dataloader_generator(dataset_list):
print(f’—> Name: {dl.dataset.name} Cases: {len(dl.dataset)} ’
f’Batches: {len(dl)} Dropped: {len(dl.dataset) - len(dl) * 73}‘)
for features, target in dl:
print(f’------> Batch dimensions: {(features.size(), target.size())}')

Epoch: 0
—> Name: <function load_diabetes at 0x000001CDBE43DA60> Cases: 442 Batches: 6 Dropped: 4
------> Batch dimensions: (torch.Size([73, 10]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 10]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 10]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 10]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 10]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 10]), torch.Size([73]))
—> Name: <function load_iris at 0x000001CDBE43D8B0> Cases: 150 Batches: 2 Dropped: 4
------> Batch dimensions: (torch.Size([73, 4]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 4]), torch.Size([73]))
—> Name: <function load_breast_cancer at 0x000001CDBE43D940> Cases: 569 Batches: 7 Dropped: 58
------> Batch dimensions: (torch.Size([73, 30]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 30]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 30]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 30]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 30]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 30]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 30]), torch.Size([73]))
Epoch: 1
—> Name: <function load_diabetes at 0x000001CDBE43DA60> Cases: 442 Batches: 6 Dropped: 4
------> Batch dimensions: (torch.Size([73, 10]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 10]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 10]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 10]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 10]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 10]), torch.Size([73]))
—> Name: <function load_iris at 0x000001CDBE43D8B0> Cases: 150 Batches: 2 Dropped: 4
------> Batch dimensions: (torch.Size([73, 4]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 4]), torch.Size([73]))
—> Name: <function load_breast_cancer at 0x000001CDBE43D940> Cases: 569 Batches: 7 Dropped: 58
------> Batch dimensions: (torch.Size([73, 30]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 30]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 30]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 30]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 30]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 30]), torch.Size([73]))
------> Batch dimensions: (torch.Size([73, 30]), torch.Size([73]))

your __len__() method should return the total number of features. Right know you are returning the number of files you have in the folder and therefore you will only access one sliding window per file. Unfortunaly there are a lot of problems following from there…The most easy solution is to load all the files and then write all the data into a single list and continue from there. It makes sense to load everything first because loading from files is most of the time very slow.

Here’s another version of the same suggestion.

#!/usr/bin/env python

coding: utf-8

In[1]:

from sklearn.datasets import load_diabetes, load_iris, load_breast_cancer

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import numpy as np
import pandas as pd

In[2]:

I will store these toy datasets in an HDD as if they were large

files that do not fit in a PC’s memory (but fit in an HDD/SDD).

Target values are in the first column.

dataset_dict = {
‘diabetes’: load_diabetes,
‘iris’: load_iris,
‘breast_cancer’: load_breast_cancer
}

for name, aux in dataset_dict.items():
aux = aux()
aux = pd.DataFrame(data=np.hstack((aux.target.reshape(-1,1), aux.data)),
columns=[‘Y’] + [‘X’ + str(i) for i in range(aux.data.shape[1])])
aux.to_csv(name + ‘.csv’, index=False)

In[80]:

class SimpleDataset(Dataset):
def init(self, features, target, size, name):
self.name = name
self.features = features
self.target = target
self.len = size

def __len__(self):
    return self.len

def __getitem__(self, idx):
    features = self.features[idx]
    target = self.target[idx]
    return features, target

def dataloader_generator(dataset_dict):
max_in_mem = 200 # Max number of instances in memory (g.t.e. to batch sizes)
for name, info in dataset_dict.items():
file_path = info[‘file path’]
batch_size = info[‘batch size’]
norm_chunksize = batch_size * (max_in_mem // batch_size)
with pd.read_csv(file_path, chunksize=norm_chunksize) as reader:
while True:
try:
chunk = next(reader)
aux = chunk.to_numpy()
dataset = SimpleDataset(aux[:, 1:], aux[:, 0], chunk.shape[0], name)
dataloader = DataLoader(dataset=dataset, batch_size=batch_size, drop_last=True)
yield dataloader
except:
break

In[81]:

A dictionary with names, file names/paths, and batch sizes

dataset_dict = {
‘diabetes’: {
‘file path’:‘diabetes.csv’,
‘batch size’: 200
},
‘iris’: {
‘file path’:‘iris.csv’,
‘batch size’: 150
},
‘breast cancer’: {
‘file path’:‘breast_cancer.csv’,
‘batch size’: 80
}
}

In[82]:

Running it for 1 epoch

for epoch in range(1):
for dl_idx, dl in enumerate(dataloader_generator(dataset_dict)):
for features, target in dl:
print((dl_idx, dl.dataset.name, dl.dataset.len, features.shape, target.shape))

(0, ‘diabetes’, 200, torch.Size([200, 10]), torch.Size([200]))
(1, ‘diabetes’, 200, torch.Size([200, 10]), torch.Size([200]))
(3, ‘iris’, 150, torch.Size([150, 4]), torch.Size([150]))
(4, ‘breast cancer’, 160, torch.Size([80, 30]), torch.Size([80]))
(4, ‘breast cancer’, 160, torch.Size([80, 30]), torch.Size([80]))
(5, ‘breast cancer’, 160, torch.Size([80, 30]), torch.Size([80]))
(5, ‘breast cancer’, 160, torch.Size([80, 30]), torch.Size([80]))
(6, ‘breast cancer’, 160, torch.Size([80, 30]), torch.Size([80]))
(6, ‘breast cancer’, 160, torch.Size([80, 30]), torch.Size([80]))
(7, ‘breast cancer’, 89, torch.Size([80, 30]), torch.Size([80]))