Combining Subsets Pytorch

Hi,

I was playing around with MNIST and I came up with the following concept:

  1. I will create 5 subsets of the training set e.g. a,b,c,d,e where b will have 25% common data of a and rest will be unique, c will have 50% common data of b and the rest will be unique, and so on.
  2. Each subset will have the same length (11000)
  3. As trainset_a will be unique, it was easier for me to take the subset of length 11,000 from the main trainset of MNIST.
  4. For trainset_b, I created two subsets: one contains 25% of the data of trainset_a, and the other contains the remaining 75%, which is unique data.

Now I want to combine these two subsets so that they are treated as a single dataset. What I mean is: if I create a data loader for trainset_b (trainset_b_loader), then trainset_b_loader.dataset should give me access to all of the data from both subsets, without a separate sub-dataset/index for each of the two subsets underneath it. As I am a newbie, I am stuck at this point and unable to find a way to achieve the goal.

My code is given below. Any help would be highly appreciated.

from __future__ import print_function, division

import torch
import torch.nn as nn
import torch.optim as optim
from torch.optim import lr_scheduler
import numpy as np
import torchvision
from torchvision import datasets, models, transforms
from torch.utils.data import Subset, Dataset, DataLoader
import matplotlib.pyplot as plt
import time
import os
import copy
import pandas as pd
import random

from torch.utils.data import Subset
from PIL import Image
from torchvision.datasets import MNIST, FashionMNIST

import torchvision.transforms as transforms

# plt.ion()   # interactive mode
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
# torch.cuda.set_device(device)

def get_target_label_idx(labels, targets, shots=5, test=False):
    """
    Get the indices of labels that are included in targets.

    The result is grouped by target, in the order the targets are given;
    within each group the indices are ascending. A target that appears
    twice in ``targets`` contributes its indices twice.

    :param labels: array of labels
    :param targets: list/tuple of target labels
    :param shots: unused; kept so existing callers keep working
    :param test: unused; kept so existing callers keep working (the two
        branches it used to select between were identical)
    :return: list with indices of target labels
    """
    final_list = []
    for t in targets:
        # Positions at which `labels` matches the current target.
        final_list.extend(np.argwhere(np.isin(labels, t)).flatten().tolist())

    return final_list


def convert_label(x):
    """Return ``x - 5`` when ``x`` is 5 or greater, otherwise ``x`` unchanged."""
    return x - 5 if x >= 5 else x


# Labels whose samples are kept; listing all ten digits keeps every sample.
normal_classes = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

transform = transforms.Compose([transforms.ToTensor()])

train_set = torchvision.datasets.MNIST(root='./data', train=True,
                                       download=True, transform=transform)

trainloader_test = torch.utils.data.DataLoader(train_set, batch_size=4,
                                               shuffle=True, num_workers=2)

# Indices into train_set of every sample whose label is in normal_classes,
# shuffled so that the slices taken below are random draws.
# `targets` replaces the deprecated `train_labels` attribute.
train_index = get_target_label_idx(train_set.targets.cpu().numpy(), normal_classes)
random.shuffle(train_index)

# Split train_index into two parts:

train_index_unseen = train_index[0:5000]
train_index_rest = train_index[5000:]

# NOTE: every Subset below indexes train_set directly, because all index
# lists hold positions in train_set. `Subset.dataset` is the wrapped parent
# dataset, not the selection, so each DataLoader is given the Subset itself;
# passing `subset.dataset` would make the loader iterate all of MNIST.

# This data will be used in the attack model
mnist_unseen = Subset(train_set, train_index_unseen)  # -----------unseen

# This data will be split into 5 training sets
mnist_trainset_rest = Subset(train_set, train_index_rest)
mnist_trainset_rest_loader = torch.utils.data.DataLoader(mnist_trainset_rest, shuffle=True, num_workers=2)

# Length of each of the 5 training sets
train_set_split_length = len(train_index_rest) // 5  # --------Each trainset size: 11000

# train_set_b: To choose 25% trainset from train_set_a
# Each train set will contain 11000 datapoints

common_portion_b = train_set_split_length * 25 // 100  # ----25% common: 2750
unique_portion_b = train_set_split_length - common_portion_b
# End offset (in train_index_rest) of train_set_b's unique samples
rest_portion_b = train_set_split_length + unique_portion_b

# train_set_c: To choose 50% trainset from train_set_b
# Each train set will contain 11000 datapoints

common_portion_c = train_set_split_length * 50 // 100  # ----50% common: 5500
unique_portion_c = train_set_split_length - common_portion_c
# End offset (in train_index_rest) of train_set_c's unique samples
rest_portion_c = rest_portion_b + unique_portion_c


# train_set_d: To choose 75% trainset from train_set_c
# Each train set will contain 11000 datapoints

common_portion_d = train_set_split_length * 75 // 100  # ----75% common: 8250
unique_portion_d = train_set_split_length - common_portion_d
# End offset (in train_index_rest) of train_set_d's unique samples
rest_portion_d = rest_portion_c + unique_portion_d

# First trainset- Unique Trainset
train_set_a = Subset(train_set, train_index_rest[0:train_set_split_length])
train_set_a_loader = torch.utils.data.DataLoader(train_set_a, batch_size=4,
                                                 shuffle=True, num_workers=2)

# Second trainset- 25% common of first Trainset
# b_1: the first 25% of train_set_a's samples (the shared part).
train_set_b_1 = Subset(train_set, train_index_rest[0:common_portion_b])
# b_2: samples that directly follow train_set_a in train_index_rest (the unique part).
train_set_b_2 = Subset(train_set, train_index_rest[train_set_split_length:rest_portion_b])
# ConcatDataset exposes both parts as one flat dataset: len() is the sum of
# the two lengths and a single integer index addresses either part.
train_set_b = torch.utils.data.ConcatDataset([train_set_b_1, train_set_b_2])
train_set_b_loader = torch.utils.data.DataLoader(train_set_b, batch_size=4,
                                                 shuffle=True, num_workers=2)

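If I understand the use case correctly, you would like to create a new Dataset by concatenating train_set_b_1 and train_set_b_2?
If so, you could use torch.utils.data.ConcatDataset and pass both datasets to it as a list, e.g. train_set_b = torch.utils.data.ConcatDataset([train_set_b_1, train_set_b_2]).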

1 Like