Loading continuous numerical data

Hi,

I’m new to PyTorch and having some issues loading continuous/numerical data properly with a DataLoader, while also mean-centering and scaling the data to unit variance.

I’m following a similar code format to what I’ve used for image data previously (e.g., a CNN for MNIST), but I’m definitely having issues properly implementing my custom transform function for this continuous/numerical data. I’d appreciate anyone’s input on this matter!

My dataset currently has 60 samples and 46 features (independent variables) that will be used to predict 1 continuous target variable (for now, could also be multiple target variables down the line) using a feed-forward/ANN type model. Future datasets could also have thousands of samples…

My code is below, but I keep getting errors about mixed input types (ndarray vs. tensor), unsupported operand type(s), etc., depending on what I attempt.

I think all I really need is for my custom transform function to properly mean-center and scale the data (and convert it to a tensor, of course), and I’d like to print out a couple of test batches of my training and validation data to make sure it’s working properly before I build the ANN model class. I should also mention that the code seems to work just fine in terms of the train/validation splits, etc. → it ultimately breaks down when the transform function is applied.
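(To be explicit, the standardization I’m after is the usual column-wise z = (x - μ) / σ, i.e., the mean μ and standard deviation σ are computed per feature over the training samples.)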

class CSVDataset(Dataset):
    # Load the dataset.
    def __init__(self, file_path, train, transform=None):
        # Read the data into a dataframe.
        df = pd.read_csv(file_path)
        # Store the input & output variables.
        self.train_X = df.iloc[:, 1:-1].values.astype('float32')
        self.train_Y = df.iloc[:, -1].values.astype('float32')
        self.test_X = df.iloc[:, 1:-1].values.astype('float32')
        self.test_Y = np.empty(len(self.test_X))
        self.transform = transform
        self.train = train

    def __getitem__(self, index):
        if self.train is True:
            X_data = self.train_X[index]
            Y_data = self.train_Y[index]
            if self.transform is not None:
                X_data = self.transform(X_data)
                Y_data = self.transform(Y_data)
            return X_data, Y_data

        else:
            X_data = self.test_X[index]
            if self.transform is not None:
                X_data = self.transform(X_data)
            return X_data

    def __len__(self):
        if self.train is True:
            return len(self.train_Y)
        else:
            return len(self.test_Y)

def train_val_dataset(dataset, val_split=0.2):
    train_idx, val_idx = train_test_split(list(range(len(dataset))), \
        test_size=val_split, shuffle=True, random_state=42)
    train_val_splits = {}
    train_val_splits['train'] = Subset(dataset, train_idx)
    train_val_splits['val'] = Subset(dataset, val_idx)
    return train_val_splits

def test_dataset(dataset):
    test_idx = list(range(len(dataset)))
    test_split = {}
    test_split['test'] = Subset(dataset, test_idx)
    return test_split


################################################################################################
class StandardScaler():
    """Standardize data by removing the mean and scaling to unit variance.
       This object can be used as a transform in PyTorch data loaders.

    Args:
        mean (FloatTensor): The mean value for each feature in the data.
        scale (FloatTensor): Per-feature relative scaling.
    """

    def __init__(self, mean=None, scale=None):
        if mean is not None:
            mean = torch.FloatTensor(mean)
        if scale is not None:
            scale = torch.FloatTensor(scale)
        self.mean_ = mean
        self.scale_ = scale

    def fit(self, sample):
        """Set the mean and scale values based on the sample data.
        """
        self.mean_ = sample.mean(0, keepdim=True)
        self.scale_ = sample.std(1, unbiased=False, keepdim=True)
        return self

    def __call__(self, sample):
        return (sample - self.mean_)/self.scale_

    def inverse_transform(self, sample):
        """Scale the data back to the original sample space.
        """
        return sample * self.scale_ + self.mean_
################################################################################################


# Create transform function to pass into CSVDataset class.
# transform = transforms.Compose([transforms.ToTensor(), StandardScaler(mean=0, scale=1)])

transform = StandardScaler()

"""
transform = StandardScaler()
transform = StandardScaler(mean=0, scale=1)
"""

# Test CSVDataset class definition & print train/test dataset sizes.
dataset = CSVDataset(file_path, train=True, transform=transform)
print(dataset.train_X)
print()

print(dataset.train_Y)
print()

print(dataset.test_X)
print()

# Print the length of the entire passed dataset.
print("Dataset Length:", len(dataset))
print()

# Split into train/validation sets & print respective lengths.
train_val_splits = train_val_dataset(dataset)
print("Train Set Length:", len(train_val_splits['train']))
print("Val Set Length:", len(train_val_splits['val']))
print()

# Pass train/validation splits into PyTorch DataLoader functions.
def cycle(iterable):
    while True:
        for x in iterable:
            yield x

train_loader = DataLoader(train_val_splits['train'], batch_size=64, shuffle=True, num_workers=0, pin_memory=True)
train_iter = iter(cycle(train_loader))

val_loader = DataLoader(train_val_splits['val'], batch_size=64, shuffle=True, num_workers=0, pin_memory=True)
val_iter = iter(cycle(val_loader))

# Acquire & print test batch of training/validation data.
dataiter = iter(train_loader)
train_batch = dataiter.next()
print(train_batch)
print()

dataiter2 = iter(val_loader)
val_batch = dataiter2.next()
print(val_batch)
print()

Thanks again!

Cheers

Can you show the error you got while running the code? (It will be more useful if I know more about the error.)

Hi suchith,

Thanks for replying!

Like I mentioned, I’ve been trying all sorts of modifications, so it’s a bit tough to keep track of, but the error I receive when running the posted code is as follows:

Traceback (most recent call last):
  File "d:/filepath.py", line 176, in <module>       
    train_batch = dataiter.next()
  File "C:\Users\rwils\Anaconda3\lib\site-packages\torch\utils\data\dataloader.py", line 435, in __next__
    data = self._next_data()
  File "C:\Users\rwils\Anaconda3\lib\site-packages\torch\utils\data\dataloader.py", line 475, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "C:\Users\rwils\Anaconda3\lib\site-packages\torch\utils\data\_utils\fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "C:\Users\rwils\Anaconda3\lib\site-packages\torch\utils\data\_utils\fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "C:\Users\rwils\Anaconda3\lib\site-packages\torch\utils\data\dataset.py", line 272, in __getitem__
    return self.dataset[self.indices[idx]]
  File "filepath.py", line 65, in __getitem__     
    X_data = self.transform(X_data)
  File "filepath.py", line 122, in __call__       
    return (sample - self.mean_)/self.scale_
TypeError: unsupported operand type(s) for -: 'float' and 'NoneType'

Thanks and cheers!

You initialized transform = StandardScaler(), which sets transform.mean_ and transform.scale_ to None.

You didn’t call the fit function and directly called the __call__ method in CSVDataset, which results in the error you have shown.
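A minimal sketch of the required ordering, using the StandardScaler class above with made-up data (the axis details of fit get sorted out later in the thread):

import torch

x = torch.rand(60, 46)      # made-up data: 60 samples, 46 features

scaler = StandardScaler()   # mean_ and scale_ start out as None
# scaler(x[0])              # would fail here: float minus NoneType
scaler.fit(x)               # fill in mean_ / scale_ from the data first
x0_std = scaler(x[0])       # now __call__ has statistics to work with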

Thanks suchith,

I apologize if this is a ‘stupid’ question, but I’m relatively new to coding - I’m a mining guy, not a computer scientist → can you indicate how I would properly call those two items you mentioned?

Thanks again in advance.

Cheers

These are some of my suggestions:

  • StandardScaler is written in such a way that it assumes the input is a torch.Tensor, and you use this class inside the CSVDataset class.
  • CSVDataset reads the data from a file into a pandas dataframe (pd.read_csv), and you apply StandardScaler to that data even though the scaler expects a tensor as input. That’s why you were getting multiple errors about input types.
  • One more thing to point out is that in your CSVDataset class all the variables are np.array (or pandas dataframe); they are only converted to torch.Tensor by the DataLoader - see the quick check below.
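A toy check of that last point (made-up numpy samples; the DataLoader’s default collate stacks them into one tensor):

import numpy as np
from torch.utils.data import DataLoader

samples = [np.ones(46, dtype='float32') for _ in range(4)]  # numpy in
loader = DataLoader(samples, batch_size=2)
batch = next(iter(loader))
print(type(batch), batch.shape)  # <class 'torch.Tensor'> torch.Size([2, 46])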

These are my changes

class CSVDataset(Dataset):
    # Load the dataset.
    def __init__(self, file_path, train, transform=None):
        # Read the data into a dataframe.
        df = pd.read_csv(file_path)

        # Store the input & output variables.
        # `.values` converts the pandas dataframe to a numpy array.
        self.train_X = df.iloc[:, 1:-1].values.astype('float32')
        self.train_Y = df.iloc[:, -1].values.astype('float32')
        self.test_X = df.iloc[:, 1:-1].values.astype('float32')
        self.test_Y = np.empty(len(self.test_X))
        self.transform = transform

        # call fit on training data.
        self.transform(self.train_X)
        self.train = train

    def __getitem__(self, index):
        if self.train is True:
            X_data = self.train_X[index]
            Y_data = self.train_Y[index]
            if self.transform is not None:
                X_data = self.transform(X_data)
                Y_data = self.transform(Y_data)
            return X_data, Y_data

        else:
            X_data = self.test_X[index]
            if self.transform is not None:
                X_data = self.transform(X_data)
            return X_data

    def __len__(self):
        if self.train is True:
            return len(self.train_Y)
        else:
            return len(self.test_Y)

def train_val_dataset(dataset, val_split=0.2):
    train_idx, val_idx = train_test_split(list(range(len(dataset))), \
        test_size=val_split, shuffle=True, random_state=42)
    train_val_splits = {}
    train_val_splits['train'] = Subset(dataset, train_idx)
    train_val_splits['val'] = Subset(dataset, val_idx)
    return train_val_splits

def test_dataset(dataset):
    test_idx = list(range(len(dataset)))
    test_split = {}
    test_split['test'] = Subset(dataset, test_idx)
    return test_split


################################################################################################
class StandardScaler():
    """Standardize data by removing the mean and scaling to unit variance.
       This object can be used as a transform in PyTorch data loaders.

    Args:
        mean (FloatTensor): The mean value for each feature in the data.
        scale (FloatTensor): Per-feature relative scaling.
    """

    def __init__(self, mean=None, scale=None):
        self.mean_ = mean
        self.scale_ = scale

    def fit(self, sample):
        """Set the mean and scale values based on the sample data.
        """
        # Changed the function so that it works when `sample` is a np.array:
        # 1) https://numpy.org/doc/stable/reference/generated/numpy.mean.html -> np.mean
        # 2) https://numpy.org/doc/stable/reference/generated/numpy.std.html -> np.std
        self.mean_ = sample.mean(0, keepdims=True)
        # Changed the axis to 0 (you want the std over samples, per feature).
        self.scale_ = sample.std(0, keepdims=True)
        return self

    def __call__(self, sample):
        return (sample - self.mean_)/self.scale_

    def inverse_transform(self, sample):
        """Scale the data back to the original sample space.
        """
        return sample * self.scale_ + self.mean_
################################################################################################


# Create transform function to pass into CSVDataset class.
# transform = transforms.Compose([transforms.ToTensor(), StandardScaler(mean=0, scale=1)])

transform = StandardScaler()

"""
transform = StandardScaler()
transform = StandardScaler(mean=0, scale=1)
"""

# Test CSVDataset class definition & print train/test dataset sizes.
dataset = CSVDataset(file_path, train=True, transform=transform)
print(dataset.train_X)
print()

print(dataset.train_Y)
print()

print(dataset.test_X)
print()

# Print the length of the entire passed dataset.
print("Dataset Length:", len(dataset))
print()


# Split into train/validation sets & print respective lengths.
train_val_splits = train_val_dataset(dataset)
print("Train Set Length:", len(train_val_splits['train']))
print("Val Set Length:", len(train_val_splits['val']))
print()

# Pass train/validation splits into PyTorch DataLoader functions.
def cycle(iterable):
    while True:
        for x in iterable:
            yield x

train_loader = DataLoader(train_val_splits['train'], batch_size=64, shuffle=True, num_workers=0, pin_memory=True)
train_iter = iter(cycle(train_loader))

val_loader = DataLoader(train_val_splits['val'], batch_size=64, shuffle=True, num_workers=0, pin_memory=True)
val_iter = iter(cycle(val_loader))

# Acquire & print test batch of training/validation data.
dataiter = iter(train_loader)
train_batch = dataiter.next()
print(train_batch)
print()

dataiter2 = iter(val_loader)
val_batch = dataiter2.next()
print(val_batch)
print()

My changes are not the only way to do this - do let me know if you get any errors while running the code.

Hi suchith,

Thanks again for your valuable input! I’m leaving the office for the day but will definitely try the suggested changes first thing in the morning.

I’ll need to pay attention a bit to the fitting changes you’ve suggested, as I need to fit/transform all of train_X, train_Y, val_X, val_Y (and test_X, which will come later). I was under the impression I would need to call the transform function under the __getitem__ method in order to avoid data leakage, but I’ll have a closer look at my custom transform function as well - maybe I can change it not to assume a tensor as input and then simply call ‘transforms.ToTensor()’…?
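(For the tensor-conversion piece, this is the kind of thing I’m picturing - a sketch with a hypothetical row, since transforms.ToTensor() is really aimed at image data and torch.as_tensor may be simpler for 1-D rows:)

import numpy as np
import torch

row = np.arange(46, dtype='float32')  # one hypothetical sample row
x = torch.as_tensor(row)              # numpy -> float32 tensor directly
print(x.dtype, x.shape)               # torch.float32 torch.Size([46])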

Regardless, you’ve got me closer than I have been to date and I really appreciate it!

Will keep you posted tomorrow - cheers!

Good morning,

I’ve implemented the majority of your suggested changes, and the code now runs, BUT only when I pass a desired set of mean and scale values into the StandardScaler() class, e.g., StandardScaler(0, 1).

In other words, my custom StandardScaler() class doesn’t seem to be properly calculating the mean and standard deviation for each feature or target variable.

I suspect this has to do with how you mentioned I need to properly call the fit function.

You had suggested:

        # call fit on training data.
        self.transform(self.train_X)
        self.train = train

But I felt I was doing so (or something similar) in the __getitem__ function definition:

def __getitem__(self, index):
        if self.train is True:
            X_data = self.train_X[index]
            Y_data = self.train_Y[index]
            if self.transform is not None:
                X_data = self.transform(X_data)
                Y_data = self.transform(Y_data)
            return X_data, Y_data

        else:
            X_data = self.test_X[index]
            if self.transform is not None:
                X_data = self.transform(X_data)
            return X_data

Here is the entire code as it currently stands:

class CSVDataset(Dataset):
    # Load the dataset.
    def __init__(self, file_path, train, transform=None):
        # Read the data into a dataframe.
        df = pd.read_csv(file_path)
        # Store the input & output variables.
        self.train_X = df.iloc[:, 1:-1].values.astype('float32')
        self.train_Y = df.iloc[:, -1].values.astype('float32')
        self.test_X = df.iloc[:, 1:-1].values.astype('float32')
        self.test_Y = np.empty(len(self.test_X))
        self.transform = transform

        
        self.train = train

    def __getitem__(self, index):
        if self.train is True:
            X_data = self.train_X[index]
            Y_data = self.train_Y[index]
            if self.transform is not None:
                X_data = self.transform(X_data)
                Y_data = self.transform(Y_data)
            return X_data, Y_data

        else:
            X_data = self.test_X[index]
            if self.transform is not None:
                X_data = self.transform(X_data)
            return X_data

    def __len__(self):
        if self.train is True:
            return len(self.train_Y)
        else:
            return len(self.test_Y)

def train_val_dataset(dataset, val_split=0.2):
    train_idx, val_idx = train_test_split(list(range(len(dataset))), \
        test_size=val_split, shuffle=False, random_state=42)
    train_val_splits = {}
    train_val_splits['train'] = Subset(dataset, train_idx)
    train_val_splits['val'] = Subset(dataset, val_idx)
    return train_val_splits

def test_dataset(dataset):
    test_idx = list(range(len(dataset)))
    test_split = {}
    test_split['test'] = Subset(dataset, test_idx)
    return test_split


################################################################################################
class StandardScaler():
    """Standardize data by removing the mean and scaling to unit variance.
       This object can be used as a transform in PyTorch data loaders.

    Args:
        mean (FloatTensor): The mean value for each feature in the data.
        scale (FloatTensor): Per-feature relative scaling.
    """

    def __init__(self, mean=None, scale=None):
        if mean is not None:
            #mean = torch.FloatTensor(mean)
            mean = mean
        if scale is not None:
            scale = scale
            #scale = torch.FloatTensor(scale)
        self.mean_ = mean
        self.scale_ = scale

    def fit(self, sample):
        """Set the mean and scale values based on the sample data.
        """
        self.mean_ = sample.mean(axis=0, keepdims=True)
        self.scale_ = sample.std(axis=0, ddof=1, keepdims=True)
        return self

    def __call__(self, sample):
        return (sample - self.mean_)/self.scale_

    def inverse_transform(self, sample):
        """Scale the data back to the original sample space.
        """
        return sample * self.scale_ + self.mean_
################################################################################################

# Create transform function to pass into CSVDataset class.
transform = StandardScaler()

# Test CSVDataset class definition & print train/test dataset sizes.
dataset = CSVDataset(file_path, train=True, transform=transform)
print(dataset.train_X)
print()

print(dataset.train_Y)
print()

print(dataset.test_X)
print()

# Print the length of the entire passed dataset.
print("Dataset Length:", len(dataset))
print()

# Split into train/validation sets & print respective lengths.
train_val_splits = train_val_dataset(dataset)
print("Train Set Length:", len(train_val_splits['train']))
print("Val Set Length:", len(train_val_splits['val']))
print()

# Pass train/validation splits into PyTorch DataLoader functions.
def cycle(iterable):
    while True:
        for x in iterable:
            yield x

train_loader = DataLoader(train_val_splits['train'], batch_size=8, shuffle=False, num_workers=0, pin_memory=True)
train_iter = iter(cycle(train_loader))

val_loader = DataLoader(train_val_splits['val'], batch_size=8, shuffle=False, num_workers=0, pin_memory=True)
val_iter = iter(cycle(val_loader))

# Acquire & print test batch of training/validation data.
dataiter = iter(train_loader)
train_batch = dataiter.next()
print(train_batch)
print()

dataiter2 = iter(val_loader)
val_batch = dataiter2.next()
print(val_batch)
print()

And the error I’m getting is as follows:

Traceback (most recent call last):
  File "filepath", line 172, in <module>       
    train_batch = dataiter.next()
  File "C:\Users\rwils\Anaconda3\lib\site-packages\torch\utils\data\dataloader.py", line 435, in __next__
    data = self._next_data()
  File "C:\Users\rwils\Anaconda3\lib\site-packages\torch\utils\data\dataloader.py", line 475, in _next_data
    data = self._dataset_fetcher.fetch(index)  # may raise StopIteration
  File "C:\Users\rwils\Anaconda3\lib\site-packages\torch\utils\data\_utils\fetch.py", line 44, in fetch
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "C:\Users\rwils\Anaconda3\lib\site-packages\torch\utils\data\_utils\fetch.py", line 44, in <listcomp>
    data = [self.dataset[idx] for idx in possibly_batched_index]
  File "C:\Users\rwils\Anaconda3\lib\site-packages\torch\utils\data\dataset.py", line 272, in __getitem__
    return self.dataset[self.indices[idx]]
  File "filepath", line 67, in __getitem__     
    X_data = self.transform(X_data)
  File "filepath", line 126, in __call__       
    return (sample - self.mean_)/self.scale_
TypeError: unsupported operand type(s) for -: 'float' and 'NoneType'

When I pass specified mean and standard-deviation values into StandardScaler(), I get tensor outputs that correspond correctly to the selected batch sizes in the dataloaders, but this is not what I really need, because the mean and std dev have to come from the population sample for each feature/target (i.e., column-wise).

Any more insights???

Thanks again!

Cheers

I am sorry - I made a mistake in the code. The proper way to call the fit function is as follows:

        # call fit on training data.
        self.transform.fit(self.train_X)
        self.train = train

Whenever you call self.transform(X_train), it redirects to the __call__ function in StandardScaler, where self.mean_ and self.scale_ are still None - that was the reason for the error.

I hope the above change works.

So interestingly, if I don’t fit the data with the transform method as you indicated, and instead pass in a mean of 0 and std of 1, the program outputs the original data values (as expected) with the correct training and validation splits according to batch size, as follows:

[tensor([[ 0.0000e+00,  0.0000e+00,  4.0000e-01,  3.8540e+01,  8.5000e-01,
          2.3000e-01,  0.0000e+00,  1.3000e-01,  1.1000e-01,  2.0000e-02, 
          1.0000e-02,  1.1630e+01,  8.5000e-01,  1.6280e+01,  1.6500e+00, 
          8.1630e+01,  9.4200e+00,  8.0080e+01,  4.1800e+00, -2.3000e-01, 
          0.0000e+00,  1.3000e-01,  0.0000e+00,  3.0000e-02,  2.1000e-01, 
          2.7000e-01,  4.0000e-02,  1.5000e-01,  2.0000e-02,  2.0000e-02, 
          2.0000e-02,  3.0000e-02,  3.0000e-02,  2.3000e-01,  4.6000e-01,
          1.0000e-02, -4.0000e-02,  5.0000e-02,  2.0000e-02,  1.1590e+01,
          6.9000e-01,  3.9500e+00,  1.7000e-01,  1.7000e-01,  1.6000e-01,
          7.0000e-01],
        [ 0.0000e+00,  1.0000e-02,  1.7000e-01,  3.8990e+01,  3.2000e-01,
          5.6000e-01,  0.0000e+00,  1.4000e-01,  6.0000e-02,  0.0000e+00,
          1.0000e-02,  1.3290e+01,  8.9000e-01,  1.5110e+01,  1.4400e+00,
          8.3080e+01,  5.7800e+00,  8.2350e+01,  1.2100e+00, -2.1000e-01,
          0.0000e+00,  8.0000e-02, -1.0000e-02,  7.0000e-02,  1.5700e+00,
          2.9000e-01,  0.0000e+00,  8.0000e-02,  1.0000e-02,  1.0000e-02,
          1.0000e-02,  5.0000e-02,  5.0000e-02,  3.6000e-01,  5.2000e-01,
          1.1000e-01, -6.9000e-01,  5.0000e-02,  2.0000e-02,  1.3110e+01,
          7.1000e-01,  9.9000e-01,  8.8000e-01,  9.0000e-02,  1.5000e-01,
          9.9000e-01]]), tensor([97.5300, 90.6500], dtype=torch.float64)]

[tensor([[ 0.0000e+00,  0.0000e+00,  2.8000e-01,  3.9370e+01,  7.6000e-01,
          2.2000e-01,  0.0000e+00,  4.0000e-02,  1.2000e-01,  1.2000e-01,
          1.0000e-02,  1.0240e+01,  2.2000e-01,  1.3970e+01,  3.5900e+00,
          8.2260e+01,  1.3830e+01,  8.1620e+01,  1.3000e-01,  2.4900e+00,
          1.0000e-02, -1.0000e-02, -1.0000e-02, -1.0000e-02,  3.5000e-01,
          2.2000e-01,  2.4000e-01,  1.7000e-01,  3.0000e-02,  3.0000e-02,
          2.0000e-02,  2.0000e-02,  2.0000e-02,  1.5000e-01,  1.9100e+00,
          0.0000e+00, -1.4000e-01,  5.0000e-02,  1.0000e-02,  1.0210e+01,
          1.0000e-01,  2.6300e+00,  2.0000e-01,  1.9000e-01, -3.0000e-02,
          2.0600e+00],
        [ 1.0000e+00,  0.0000e+00,  2.0000e-01,  4.0670e+01,  1.8000e-01,
          2.0000e-02,  0.0000e+00,  4.0000e-02,  7.0000e-02,  1.0000e-02,
          1.0000e-02,  9.3900e+00,  1.8000e-01,  1.4200e+01,  3.1200e+00,
          8.3160e+01,  2.4400e+00,  8.6000e+01,  1.0000e-01,  1.0000e-02,
          0.0000e+00, -1.0000e-02,  0.0000e+00, -2.0000e-02,  4.0000e-02,
          1.0000e-02,  2.0000e-02,  4.0000e-02,  4.0000e-02,  4.0000e-02,
          3.0000e-02,  2.0000e-02,  2.0000e-02,  1.4000e-01,  1.3600e+00,
          0.0000e+00, -8.0000e-02,  5.0000e-02,  1.0000e-02,  9.3900e+00,
          1.7000e-01,  1.1000e-01, -3.0000e-02,  9.0000e-02, -3.0000e-02,
          1.5000e+00]]), tensor([88.9200, 16.4300], dtype=torch.float64)]

Recall, I should have 46 features and 1 target for the current dataset (which is shown correctly for a batch size of 2 above).

HOWEVER, when I fit the transform as follows:

class CSVDataset(Dataset):
    # Load the dataset.
    def __init__(self, file_path, train, transform=None):
        # Read the data into a dataframe.
        df = pd.read_csv(file_path)
        # Store the input & output variables.
        self.train_X = df.iloc[:, 1:-1].values.astype('float32')
        self.train_Y = df.iloc[:, -1].values.astype('float32')
        self.test_X = df.iloc[:, 1:-1].values.astype('float32')
        self.test_Y = np.empty(len(self.test_X))
        self.transform = transform
        self.train = train

        # Call fit on each data type.
        self.transform.fit(self.train_X)
        self.transform.fit(self.train_Y)
        self.transform.fit(self.test_X)

    def __getitem__(self, index):
        if self.train is True:
            X_data = self.train_X[index]
            Y_data = self.train_Y[index]
            if self.transform is not None:
                X_data = self.transform(X_data)
                Y_data = self.transform(Y_data)
            return X_data, Y_data

        else:
            X_data = self.test_X[index]
            if self.transform is not None:
                X_data = self.transform(X_data)
            return X_data

    def __len__(self):
        if self.train is True:
            return len(self.train_Y)
        else:
            return len(self.test_Y)

And remove the passed-in mean and std:

# Create transform function to pass into CSVDataset class.
transform = StandardScaler()

In this instance/attempt, I get the correct transformed values for my features (which is great!), but the values for my target variable are now INCORRECT (and for some reason there are 46 values, when there should only be 2 for the current batch size) - see below:

[tensor([[[-1.1340, -0.5446, -0.5598,  0.2025, -0.6553, -0.7396, -0.5494,
          -0.3097, -0.8017, -0.5771, -0.4696,  1.2344,  1.3403,  1.6143,
          -1.1446, -0.9779, -0.9181,  0.4569, -0.3281, -0.8027, -0.2748,
           0.0803, -0.4945, -0.1579, -0.7164,  0.4976, -0.5771,  0.0457,
          -0.8130,  0.1791,  0.3221,  0.1381,  0.1431,  0.0120, -0.8547,
          -0.5386, -0.2114, -0.4859,  0.5818,  1.2470,  1.1203, -0.5967,
          -0.8299, -0.8094, -0.4081, -0.7897]],

        [[-1.1340, -0.4796, -1.0978,  0.3648, -0.9092, -0.2407, -0.5494,
          -0.2918, -1.0933, -1.0547, -0.4696,  1.7783,  1.4826,  1.3624,
          -1.1927, -0.4703, -1.0529,  0.6592, -0.8991, -0.7988, -0.2748,
          -0.0201, -0.5004,  0.0947,  0.4627,  0.5856, -1.0547, -0.6919,
          -0.8472, -0.7163, -0.5982,  0.2875,  0.2861,  0.1185, -0.7848,
          -0.4816, -0.8184, -0.4859,  0.5818,  1.7256,  1.1938, -0.8942,
          -0.2697, -1.0866, -0.4131, -0.6521]]]), tensor([[[1.9404e+02, 6.3300e+02, 2.2663e+02, 2.1479e+01, 4.5670e+01,
          1.4638e+02, 4.3743e+02, 1.7439e+02, 5.6724e+02, 2.3280e+03,
          5.8455e+03, 2.9376e+01, 3.4521e+02, 1.9109e+01, 2.0818e+01,
          4.5882e+00, 2.3434e+00, 2.0127e+00, 1.7619e+01, 1.8198e+01,
          2.6207e+02, 1.9558e+02, 5.7333e+01, 6.1546e+02, 8.3662e+01,
          4.2819e+02, 1.1635e+03, 1.0261e+03, 3.3252e+02, 8.7306e+03,
          8.9749e+03, 7.2811e+02, 6.9760e+02, 7.9742e+01, 1.1220e+02,
          5.5030e+01, 9.0906e+01, 1.0680e+03, 6.5462e+03, 2.8306e+01,
          3.5664e+02, 8.8077e+00, 7.5992e+01, 3.3645e+02, 4.8128e+01,
          4.5134e+01]],

        [[1.8027e+02, 5.8831e+02, 2.1053e+02, 1.8997e+01, 4.2373e+01,
          1.3598e+02, 4.0653e+02, 1.6205e+02, 5.2713e+02, 2.1637e+03,
          5.4331e+03, 2.7122e+01, 3.2074e+02, 1.7628e+01, 1.9242e+01,
          2.1797e+00, 2.0888e+00, 1.3993e+00, 1.6296e+01, 1.6860e+01,
          2.4356e+02, 1.8177e+02, 5.3254e+01, 5.7202e+02, 7.7697e+01,
          3.9794e+02, 1.0813e+03, 9.5357e+02, 3.0900e+02, 8.1146e+03,
          8.3417e+03, 6.7674e+02, 6.4838e+02, 7.4105e+01, 1.0418e+02,
          5.1109e+01, 8.4481e+01, 9.9259e+02, 6.0844e+03, 2.6139e+01,
          3.3139e+02, 8.1163e+00, 7.0563e+01, 3.1262e+02, 4.4699e+01,
          4.1871e+01]]])]

[tensor([[[-1.1340, -0.5446, -0.8405,  0.5018, -0.6984, -0.7547, -0.5494,
          -0.4711, -0.7434,  1.8109, -0.4696,  0.7791, -0.9005,  1.1169,
          -0.7002, -0.7574, -0.7549,  0.5942, -1.1068, -0.2740, -0.2479,
          -0.2007, -0.5004, -0.4104, -0.5951,  0.2778,  1.8109,  0.2564,
          -0.7788,  1.0744,  0.3221,  0.0635,  0.0715, -0.0535,  0.8341,
          -0.5443, -0.3048, -0.4859, -0.0895,  0.8125, -1.0457, -0.7294,
          -0.8063, -0.7402, -0.5028, -0.1447]],

        [[ 0.8672, -0.5446, -1.0276,  0.9707, -0.9763, -1.0572, -0.5494,
          -0.4711, -1.0350, -0.8159, -0.4696,  0.5006, -1.0427,  1.1664,
          -0.8079, -0.4423, -1.1765,  0.9847, -1.1125, -0.7560, -0.2748,
          -0.2007, -0.4945, -0.4736, -0.8638, -0.6457, -0.8159, -1.1133,
          -0.7447,  1.9697,  1.2425,  0.0635,  0.0715, -0.0617,  0.1935,
          -0.5443, -0.2487, -0.4859, -0.0895,  0.5543, -0.7887, -0.9826,
          -0.9878, -1.0866, -0.5028, -0.4102]]]), tensor([[[ 1.7681e+02,  5.7707e+02,  2.0649e+02,  1.8373e+01,  4.1544e+01,       
           1.3336e+02,  3.9876e+02,  1.5894e+02,  5.1704e+02,  2.1224e+03,
           5.3293e+03,  2.6556e+01,  3.1459e+02,  1.7255e+01,  1.8846e+01,
           1.5741e+00,  2.0247e+00,  1.2450e+00,  1.5963e+01,  1.6524e+01,
           2.3891e+02,  1.7830e+02,  5.2228e+01,  5.6110e+02,  7.6197e+01,
           3.9033e+02,  1.0607e+03,  9.3535e+02,  3.0309e+02,  7.9597e+03,
           8.1825e+03,  6.6382e+02,  6.3601e+02,  7.2687e+01,  1.0217e+02,
           5.0124e+01,  8.2866e+01,  9.7362e+02,  5.9682e+03,  2.5595e+01,
           3.2503e+02,  7.9425e+00,  6.9198e+01,  3.0663e+02,  4.3837e+01,
           4.1051e+01]],

        [[ 3.1745e+01,  1.0618e+02,  3.6934e+01, -7.7721e+00,  6.8101e+00,
           2.3755e+01,  7.3233e+01,  2.8926e+01,  9.4359e+01,  3.9130e+02,
           9.8385e+02,  2.8070e+00,  5.6755e+01,  1.6466e+00,  2.2410e+00,
          -2.3803e+01, -6.5864e-01, -5.2180e+00,  2.0269e+00,  2.4353e+00,
           4.3920e+01,  3.2797e+01,  9.2472e+00,  1.0339e+02,  1.3347e+01,
           7.1560e+01,  1.9512e+02,  1.7158e+02,  5.5284e+01,  1.4694e+03,
           1.5107e+03,  1.2259e+02,  1.1746e+02,  1.3287e+01,  1.7745e+01,
           8.8178e+00,  1.5169e+01,  1.7906e+02,  1.1021e+03,  2.7709e+00,
           5.8906e+01,  6.5746e-01,  1.2000e+01,  5.5516e+01,  7.7021e+00,
           6.6707e+00]]])]

Any idea what I’m doing wrong now??? Sorry about this, but I really appreciate your help and it appears I’m almost where I need to be in order to begin building my ANN, so thank you!!!

Thanks in advance and cheers

Hello

Can I ask why you fit on train_X, train_Y, and test_X?

You should only fit on train_X and not on train_Y or test_X.

You don’t have to mean-center the target (Y), and when centering the test data you have to use the mean from the train data.

Also, one more change: in __getitem__ you don’t have to apply the transformation to Y_data.

The code will look like this:

if self.transform is not None:
   X_data = self.transform(X_data)
return X_data, Y_data
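For example, to keep the statistics leakage-free you would fit once on the training rows and reuse them everywhere - a sketch, assuming train_idx and val_idx are the index lists produced by train_val_dataset:

# Fit the scaler on training rows only, then reuse those statistics.
X = dataset.train_X                 # full (samples, features) numpy array
scaler = StandardScaler()
scaler.fit(X[train_idx])            # mean/std come from training rows only

X_train_std = scaler(X[train_idx])  # train transformed with train stats
X_val_std = scaler(X[val_idx])      # val transformed with the SAME stats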

Hi again,

You are correct that I shouldn’t have been trying to fit on the test data, but I just wanted to mention that there are definitely instances where one would want to transform both the feature and target variables.

Depending on the data distributions (and the stochastic variability of natural phenomena), transforming both features and targets can improve stability and help reduce gradient magnitudes (faster/more stable convergence).
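In that setup the scaler’s inverse_transform is what maps predictions back to the original units - a sketch, assuming a separate y_scaler and a hypothetical trained model:

y_scaler = StandardScaler()
y_scaler.fit(dataset.train_Y)       # per-target mean/std from training data
y_std = y_scaler(dataset.train_Y)   # scaled target used to fit the model

# y_pred_std = model(x)                            # hypothetical model output
# y_pred = y_scaler.inverse_transform(y_pred_std)  # back to original units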

Regardless, your questions and suggestions allowed me to get the code working exactly as I’d intended, so a BIG THANK YOU!!!

I ended up applying the data fitting within the __getitem__ method, as follows:

    def __getitem__(self, index):
        if self.train is True:
            self.transform.fit(self.train_X)
            X_data = self.train_X[index]
            if self.transform is not None:
                X_data = self.transform(X_data)
        
        if self.train is True:
            self.transform.fit(self.train_Y)
            Y_data = self.train_Y[index]
            if self.transform is not None:
                Y_data = self.transform(Y_data)
            return X_data, Y_data

        else:
            self.transform.fit(self.train_X)
            X_data = self.test_X[index]
            if self.transform is not None:
                X_data = self.transform(X_data)
            return X_data

This was my first time posting on a coding site/forum, and you’ve made it a positive experience for me - much appreciated!

Thanks again and cheers

Hello,

I just wanted to mention that there are definitely instances when one would want to transform both feature and target variables.

To make this work you have to pass two different transforms - one for the features and one for the target. That way each transform stores its own mean and std, and they don’t interfere with each other when called.

The code looks like this (it should give more clarity about what I am saying):

class CSVDataset(Dataset):
    # Load the dataset.
    def __init__(self, file_path, train, X_transform=None, y_transform=None):
        # Read the data into a dataframe.
        df = pd.read_csv(file_path)
        # Store the input & output variables.
        self.train_X = df.iloc[:, 1:-1].values.astype('float32')
        self.train_Y = df.iloc[:, -1].values.astype('float32')
        self.test_X = df.iloc[:, 1:-1].values.astype('float32')
        self.test_Y = np.empty(len(self.test_X))
        self.X_transform = X_transform
        self.y_transform = y_transform
        
        # Fit on the training data.
        self.X_transform.fit(self.train_X)
        self.y_transform.fit(self.train_Y)

        self.train = train

    def __getitem__(self, index):
        if self.train is True:
            X_data = self.train_X[index]
            Y_data = self.train_Y[index]
            if self.X_transform is not None:
                X_data = self.X_transform(X_data)
            if self.y_transform is not None:
                Y_data = self.y_transform(Y_data)
            return X_data, Y_data
        else:
            X_data = self.test_X[index]
            if self.X_transform is not None:
                X_data = self.X_transform(X_data)
            return X_data

    def __len__(self):
        if self.train is True:
            return len(self.train_Y)
        else:
            return len(self.test_Y)

def train_val_dataset(dataset, val_split=0.2):
    train_idx, val_idx = train_test_split(list(range(len(dataset))), \
        test_size=val_split, shuffle=False, random_state=42)
    train_val_splits = {}
    train_val_splits['train'] = Subset(dataset, train_idx)
    train_val_splits['val'] = Subset(dataset, val_idx)
    return train_val_splits

def test_dataset(dataset):
    test_idx = list(range(len(dataset)))
    test_split = {}
    test_split['test'] = Subset(dataset, test_idx)
    return test_split


################################################################################################
class StandardScaler():
    """Standardize data by removing the mean and scaling to unit variance.
       This object can be used as a transform in PyTorch data loaders.

    Args:
        mean (FloatTensor): The mean value for each feature in the data.
        scale (FloatTensor): Per-feature relative scaling.
    """

    def __init__(self, mean=None, scale=None):
        self.mean_ = mean
        self.scale_ = scale

    def fit(self, sample):
        """Set the mean and scale values based on the sample data.
        """
        self.mean_ = sample.mean(axis=0, keepdims=True)
        self.scale_ = sample.std(axis=0, ddof=1, keepdims=True)
        return self

    def __call__(self, sample):
        return (sample - self.mean_)/self.scale_

    def inverse_transform(self, sample):
        """Scale the data back to the original sample space.
        """
        return sample * self.scale_ + self.mean_
################################################################################################

# Create transform function to pass into CSVDataset class.
X_transform = StandardScaler()
y_transform = StandardScaler()


# Test CSVDataset class definition & print train/test dataset sizes.
dataset = CSVDataset(file_path, train=True, X_transform=X_transform, y_transform=y_transform)
print(dataset.train_X)
print()

print(dataset.train_Y)
print()

print(dataset.test_X)
print()

# Print the length of the entire passed dataset.
print("Dataset Length:", len(dataset))
print()

# Split into train/validation sets & print respective lengths.
train_val_splits = train_val_dataset(dataset)
print("Train Set Length:", len(train_val_splits['train']))
print("Val Set Length:", len(train_val_splits['val']))
print()

# Pass train/validation splits into PyTorch DataLoader functions.
def cycle(iterable):
    while True:
        for x in iterable:
            yield x

train_loader = DataLoader(train_val_splits['train'], batch_size=8, shuffle=False, num_workers=0, pin_memory=True)
train_iter = iter(cycle(train_loader))

val_loader = DataLoader(train_val_splits['val'], batch_size=8, shuffle=False, num_workers=0, pin_memory=True)
val_iter = iter(cycle(val_loader))

# Acquire & print test batch of training/validation data.
dataiter = iter(train_loader)
train_batch = dataiter.next()
print(train_batch)
print()

dataiter2 = iter(val_loader)
val_batch = dataiter2.next()
print(val_batch)
print()