How to apply image transformation to a pre-constructed dataset?

My data preprocessing pipeline is: construct the CIFAR10 training and test sets; add different perturbations to the training and test sets; then apply image transformations. Because of the perturbation step, I can't apply the image transformations by specifying the transform argument of torchvision.datasets.cifar.CIFAR10, so I wrote a custom CIFAR10 dataset.

cifar.py
import pickle
import os

import numpy as np
from PIL import Image
from torch.utils.data.dataset import Dataset


class CIFAR10(Dataset):
    def __init__(self, data_dir, transform=None, evaluation=False):
        self.classes = 10
        self.transform = transform
        if not evaluation:
            data_list = [
                "data_batch_1", "data_batch_2", "data_batch_3", "data_batch_4",
                "data_batch_5"
            ]
        else:
            data_list = ["test_batch"]

        data = []
        targets = []
        for file_name in data_list:
            file_path = os.path.join(data_dir, file_name)
            with open(file_path, "rb") as f:
                entry = pickle.load(f, encoding="latin1")
            data.append(entry["data"])
            targets.extend(entry["labels"])

        data = np.vstack(data).reshape(-1, 3, 32, 32)
        # convert to NHWC; HWC uint8 arrays work better with PIL-based transformations.
        self.data = data.transpose((0, 2, 3, 1))
        self.targets = np.asarray(targets)

    def __getitem__(self, index):
        img, target = self.data[index], self.targets[index]
        img = Image.fromarray(img)
        if self.transform is not None:
            img = self.transform(img)
        return img, target

    def __len__(self):
        return len(self.data)
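
For reference, this is roughly how I use the custom dataset when specifying the transform argument (the path is just my local layout):

import torchvision.transforms as transforms
from torch.utils.data import DataLoader

from cifar import CIFAR10

transform = transforms.ToTensor()  # PIL image (HWC, uint8) -> tensor (CHW, float32 in [0, 1])
dataset = CIFAR10("./cifar10/cifar-10-batches-py", transform=transform)
loader = DataLoader(dataset, batch_size=128, shuffle=True)
images, targets = next(iter(loader))
print(images.shape, images.dtype)  # torch.Size([128, 3, 32, 32]) torch.float32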

And I wrote a function to apply image transformations to a pre-constructed dataset.

def dst_transforms(dataset, transform):
    data_loader = DataLoader(dataset, batch_size=len(dataset), num_workers=3)
    data, targets = list(data_loader)[0]  # data_loader only has one batch
    transformed_data = torch.zeros(data.shape).permute(0, 3, 1, 2)
    for i, d in enumerate(data):
        transformed_data[i] = transform(d.numpy())  # type(d) is torch.Tensor
    transformed_dst = TensorDataset(transformed_data, targets)
    return transformed_dst

However, when I preprocess the data with the same transformation, I get lower accuracy with dst_transforms than with the transform argument. What's wrong with my dst_transforms? Any help is appreciated!

The full example code follows.

main.py
import numpy as np
import setGPU
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
import torchvision.models as models
import torchvision.transforms as transforms
from torch.utils.data import DataLoader, TensorDataset

import cifar


def dst_transforms(dataset, transform):
    data_loader = DataLoader(dataset, batch_size=len(dataset), num_workers=3)
    data, targets = list(data_loader)[0]  # data_loader only has one batch
    transformed_data = torch.zeros(data.shape).permute(0, 3, 1, 2)
    for i, d in enumerate(data):
        transformed_data[i] = transform(d.numpy())  # type(d) is torch.Tensor
    transformed_dst = TensorDataset(transformed_data, targets)
    return transformed_dst


def train(model, train_loader, epoch):
    device = torch.device("cuda")
    model = model.to(device)
    model.train()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
    train_loss = 0
    correct = 0
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = data.to(device), target.to(device)
        optimizer.zero_grad()
        output = model(data)
        loss = F.cross_entropy(output, target)
        loss.backward()
        optimizer.step()

        train_loss += loss.item()
        pred = output.argmax(dim=1, keepdim=True)
        correct += pred.eq(target.view_as(pred)).sum().item()

    train_loss /= len(train_loader.dataset)
    acc = 100. * correct / len(train_loader.dataset)

    return train_loss, acc


def test(model, test_loader, epoch):
    device = torch.device("cuda")
    model.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(test_loader):
            data, target = data.to(device), target.to(device)
            output = model(data)
            loss = F.cross_entropy(output, target)

            test_loss += loss.item()
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(target.view_as(pred)).sum().item()

    test_loss /= len(test_loader.dataset)
    acc = 100. * correct / len(test_loader.dataset)

    return test_loss, acc


def main():
    print('==> Preparing data..')
    train_transform = transforms.Compose([
        # cifar dataset consists of np.ndarray (NHWC).
        #transforms.ToPILImage(),
        #transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),  # NHWC => NCHW
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010))
    ])
    test_transform = transforms.Compose([
        transforms.ToTensor(),  # NHWC => NCHW
        transforms.Normalize((0.4914, 0.4822, 0.4465),
                             (0.2023, 0.1994, 0.2010))
    ])

    train_data = cifar.CIFAR10("./cifar10/cifar-10-batches-py",
                               transform=train_transform)
    test_data = cifar.CIFAR10("./cifar10/cifar-10-batches-py",
                              transform=test_transform,
                              evaluation=True)

    #train_data = dst_transforms(train_data, train_transform)
    train_loader = DataLoader(train_data,
                              batch_size=128,
                              num_workers=3,
                              shuffle=True)
    #test_data = dst_transforms(test_data, test_transform)
    test_loader = DataLoader(test_data,
                             batch_size=100,
                             num_workers=3,
                             shuffle=False)

    model = models.resnet18()
    for epoch in range(200):
        train_loss, train_acc = train(model, train_loader, epoch)
        test_loss, test_acc = test(model, test_loader, epoch)
        print(
            "train loss: {}, train accuracy: {}, test loss: {}, test accuracy: {}\n"
            .format(train_loss, train_acc, test_loss, test_acc))


if __name__ == "__main__":
    main()

Could you check the dtype as well as the min and max value of a data sample using your custom CIFAR10 dataset?
I suspect that based on the dtype some transformations will work differently (e.g. ToTensor).
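
For example, something along these lines (using your custom cifar.CIFAR10 without any transform):

import cifar

dataset = cifar.CIFAR10("./cifar10/cifar-10-batches-py")  # no transform
print(dataset.data.dtype, dataset.data.min(), dataset.data.max())
img, target = dataset[0]
print(type(img))  # a PIL image is returned when transform is None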

Thanks for your reply! The dtype is torch.uint8, and the min value is 0 while the max value is 255 when not specifying the transform argument. I found that, when using the same image transformations, the TensorDataset returned by dst_transforms has the same values as the custom CIFAR10 dataset with the transform argument specified. However, I haven't figured out why they produce different training results.

Thanks for the information.
If both transformations yield the same output, could the training itself be a bit flaky?
I.e. is the lower accuracy reproducible and how large is the difference?
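
To check it, you could seed everything at the beginning of each run and compare a few runs of both approaches. A minimal sketch:

import random

import numpy as np
import torch

def seed_all(seed=0):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

This won't give bitwise-identical results in every setup, but it should be close enough to see whether the gap between the two pipelines is systematic or just run-to-run noise.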

I figured it out. The difference comes from the random model initialization (run-to-run variance), not from the transformations. I also found that, by writing custom lambda transformations, I can make the data preprocessing pipeline more efficient and elegant. Thank you anyway!
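
For anyone who hits the same issue, this is roughly what I mean (add_perturbation is just a placeholder for my actual perturbation):

import numpy as np
from PIL import Image
import torchvision.transforms as transforms

def add_perturbation(img):
    # placeholder: add Gaussian noise to the PIL image and clip back to uint8
    arr = np.asarray(img).astype(np.float32)
    arr += np.random.normal(0.0, 8.0, arr.shape)
    return Image.fromarray(np.clip(arr, 0, 255).astype(np.uint8))

train_transform = transforms.Compose([
    transforms.Lambda(add_perturbation),  # perturbation runs inside the per-sample pipeline
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465),
                         (0.2023, 0.1994, 0.2010)),
])

This keeps the perturbation per-sample inside the transform pipeline, so I don't need dst_transforms at all.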