Hi,
I’m new to PyTorch and having some issues loading continuous/numerical data properly using a dataloader, while also mean-centering and scaling the data to unit variance.
I’m following a similar code format to what I’ve used for image data previously (e.g., CNN for MNIST), but definitely having issues with properly setting/implementing my custom transform function to this continuous/numerical data. I’d appreciate anyone’s input on this matter!
My dataset currently has 60 samples and 46 features (independent variables) that will be used to predict 1 continuous target variable (for now, could also be multiple target variables down the line) using a feed-forward/ANN type model. Future datasets could also have thousands of samples…
My code is as follows below, but I keep getting errors associated with multiple input types (ndarray vs. tensor), unsupported operand type(s), etc. depending on what I attempt.
I think really all I need is for my custom transform function to properly mean-center and scale the data (as well as transform to tensor of course) and I’d like to print out a couple test batches of my training and validation data so that I can make sure it’s working properly before I build the ANN model class. I should also mention that the code seems to be working just fine in terms of the train/validation splits, etc. → it ultimately breaks down when trying to pass the transform function.
class CSVDataset(Dataset):
# Load the dataset.
def __init__(self, file_path, train, transform=None):
# Read the data into a dataframe.
df = pd.read_csv(file_path)
# Store the input & output variables.
self.train_X = df.iloc[:, 1:-1].values.astype('float32')
self.train_Y = df.iloc[:, -1].values.astype('float32')
self.test_X = df.iloc[:, 1:-1].values.astype('float32')
self.test_Y = np.empty(len(self.test_X))
self.transform = transform
self.train = train
def __getitem__(self, index):
if self.train is True:
X_data = self.train_X[index]
Y_data = self.train_Y[index]
if self.transform is not None:
X_data = self.transform(X_data)
Y_data = self.transform(Y_data)
return X_data, Y_data
else:
X_data = self.test_X[index]
if self.transform is not None:
X_data = self.transform(X_data)
return X_data
def __len__(self):
if self.train is True:
return len(self.train_Y)
else:
return len(self.test_Y)
def train_val_dataset(dataset, val_split=0.2):
train_idx, val_idx = train_test_split(list(range(len(dataset))), \
test_size=val_split, shuffle=True, random_state=42)
train_val_splits = {}
train_val_splits['train'] = Subset(dataset, train_idx)
train_val_splits['val'] = Subset(dataset, val_idx)
return train_val_splits
def test_dataset(dataset):
test_idx = list(range(len(dataset)))
test_split = {}
test_split['test'] = Subset(dataset, test_idx)
return test_split
################################################################################################
class StandardScaler():
"""Standardize data by removing the mean and scaling to unit variance.
This object can be used as a transform in PyTorch data loaders.
Args:
mean (FloatTensor): The mean value for each feature in the data.
scale (FloatTensor): Per-feature relative scaling.
"""
def __init__(self, mean=None, scale=None):
if mean is not None:
mean = torch.FloatTensor(mean)
if scale is not None:
scale = torch.FloatTensor(scale)
self.mean_ = mean
self.scale_ = scale
def fit(self, sample):
"""Set the mean and scale values based on the sample data.
"""
self.mean_ = sample.mean(0, keepdim=True)
self.scale_ = sample.std(1, unbiased=False, keepdim=True)
return self
def __call__(self, sample):
return (sample - self.mean_)/self.scale_
def inverse_transform(self, sample):
"""Scale the data back to the original sample space.
"""
return sample * self.scale_ + self.mean_
################################################################################################
# Create transform function to pass into CSVDataset class.
# transform = transforms.Compose([transforms.ToTensor(), StandardScaler(mean=0, scale=1)])
transform = StandardScaler()
"""
transform = StandardScaler()
transform = StandardScaler(mean=0, scale=1)
"""
# Test CSVDataset class definition & print train/test dataset sizes.
dataset = CSVDataset(file_path, train=True, transform=transform)
print(dataset.train_X)
print()
print(dataset.train_Y)
print()
print(dataset.test_X)
print()
# Print the length of the entire passed dataset.
print("Dataset Length:", len(dataset))
print()
# Split into train/validation sets & print respective lengths.
train_val_splits = train_val_dataset(dataset)
print("Train Set Length:", len(train_val_splits['train']))
print("Val Set Length:", len(train_val_splits['val']))
print()
# Pass train/validation splits into PyTorch DataLoader functions.
def cycle(iterable):
while True:
for x in iterable:
yield x
train_loader = DataLoader(train_val_splits['train'], batch_size=64, shuffle=True, num_workers=0, pin_memory=True)
train_iter = iter(cycle(train_loader))
val_loader = DataLoader(train_val_splits['val'], batch_size=64, shuffle=True, num_workers=0, pin_memory=True)
val_iter = iter(cycle(val_loader))
# Acquire & print test batch of training/validation data.
dataiter = iter(train_loader)
train_batch = dataiter.next()
print(train_batch)
print()
dataiter2 = iter(val_loader)
val_batch = dataiter2.next()
print(val_batch)
print()
Thanks again!
Cheers