I was playing around with MNIST and I came up with the following concept:
- I will create 5 subsets of the training set e.g. a,b,c,d,e where b will have 25% common data of a and rest will be unique, c will have 50% common data of b and the rest will be unique, and so on.
- Each subset will have the same length (11000)
- As trainset_a will be unique, it was easier for me to take the subset of length 11,000 from the main trainset of MNIST.
- For trainset_b, I created two subsets: one contains 25% of the data of trainset_a, the other contains 75% unique data.
Now I want to combine these two subsets so that they are treated as a single subset. In other words, if I create a data loader for trainset_b (trainset_b_loader), then trainset_b_loader.dataset should give me access to all of the data from both subsets, without the two subsets showing up as separate sub-datasets or index groups under dataset. As I am a newbie, I am stuck at this point and unable to find a way to achieve the goal.
My code is given below. Any help would be highly appreciated.
from __future__ import print_function, division import torch import torch.nn as nn import torch.optim as optim from torch.optim import lr_scheduler import numpy as np import torchvision from torchvision import datasets, models, transforms from torch.utils.data import Subset, Dataset, DataLoader import matplotlib.pyplot as plt import time import os import copy import pandas as pd import random from torch.utils.data import Subset from PIL import Image from torchvision.datasets import MNIST, FashionMNIST import torchvision.transforms as transforms #plt.ion() # interactive mode device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # torch.cuda.set_device(device) def get_target_label_idx(labels, targets, shots=5, test=False): """ Get the indices of labels that are included in targets. :param labels: array of labels :param targets: list/tuple of target labels :return: list with indices of target labels """ final_list =  # Both if and else operations seem to be the same, what would be the purpose of this? 
for t in targets: if test: final_list += np.argwhere(np.isin(labels, t)).flatten().tolist() else: final_list += np.argwhere(np.isin(labels, t)).flatten().tolist() return final_list def convert_label(x): if x >= 5: return x - 5 else: return x normal_classes = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9] transform = transforms.Compose([transforms.ToTensor()]) train_set = torchvision.datasets.MNIST(root='./data', train=True, download=True, transform=transform) trainloader_test = torch.utils.data.DataLoader(train_set, batch_size=4, shuffle=True, num_workers=2) train_index = get_target_label_idx(train_set.train_labels.clone().data.cpu().numpy(), normal_classes) random.shuffle(train_index) # Split train_index into two batch: train_index_unseen = train_index[0:5000] train_index_rest = train_index[5000:] # print(type(train_index_rest)) # This data will be used in the attack model mnist_unseen = Subset(trainloader_test.dataset, train_index_unseen) # -----------unseen # This data will be splitted into 5 batches mnist_trainset_rest = Subset(trainloader_test.dataset, train_index_rest) mnist_trainset_rest_loader = torch.utils.data.DataLoader(mnist_trainset_rest.dataset, shuffle=True, num_workers=2) # This will return the length of each batch train_set_split_length = int(len(train_index_rest) / 5) # --------Each trainset size: 11000 # train_set_b: To choose 25% trainset from train_set_a # Each train set will contain 11000 datapoints common_portion_b = int((train_set_split_length * (25 / 100))) # ----25% common: 2750 unique_portion_b = train_set_split_length - common_portion_b # print(unique_portion_b) rest_portion_b = train_set_split_length + unique_portion_b # print(rest_portion_b) # train_set_c: To choose 50% trainset from train_set_b # Each train set will contain 11000 datapoints common_portion_c = int((train_set_split_length * (50 / 100))) # ----50% common: 5500 unique_portion_c = train_set_split_length - common_portion_c # print(unique_portion_c) rest_portion_c = rest_portion_b + 
unique_portion_c # print(rest_portion_c) # train_set_d: To choose 75% trainset from train_set_c # Each train set will contain 11000 datapoints common_portion_d = int((train_set_split_length * (75 / 100))) # ----50% common: 8250 unique_portion_d = train_set_split_length - common_portion_d rest_portion_d = rest_portion_c + unique_portion_d # print(rest_portion_d) # First trainset- Unique Trainset train_set_a = Subset(mnist_trainset_rest_loader.dataset, train_index_rest[0:train_set_split_length]) # train_set_a_df = PandasDataset(train_set_a) train_set_a_loader = torch.utils.data.DataLoader(train_set_a.dataset, batch_size=4, shuffle=True, num_workers=2) # Second trainset- 25% common of first Trainset train_set_b_1 = Subset(train_set_a_loader.dataset, train_index_rest[0:common_portion_b]) train_set_b_2 = Subset(mnist_trainset_rest.dataset, train_index_rest[train_set_split_length:rest_portion_b]) train_set_b = ????