Sample balanced data

NearIt · June 12, 2020, 7:34am

i am using

FashionMNIST_train_dataset = torchvision.datasets.FashionMNIST(
    root = './data/FashionMNIST',
    train = True,
    download = True,
    transform = transforms.Compose([
        transforms.ToTensor()                                 
    ])
)

and I want SPLIT this data to N balanced labeled data, and the rest (60000-N).
how can I do that?

ptrblck · June 12, 2020, 9:46am

You could use e.g. sklearn.model_selection.StratifiedShuffleSplit (or any other suitable splitting function) to create the split indices.
Once you have the indices, you could pass your dataset together with the split indices to a Subset and pass it then to a DataLoader.

NearIt · June 12, 2020, 7:25pm

this is what i did:

def GetIndices (data,N):
  # given data, divide it to N balanced labels samples.
  # return
  
  Nclasses = len(data.classes)
  Ndata = len(data.data)
  count = [[] for x in range(Nclasses)]   #init. count contain the idx of each label                                               
  for idx, (_,label) in enumerate(data):
    count[label].append(idx)
  # this is to check that this is balance label data 
  #for i in range(len(count)):
  #  Nsamples= len(count[i])
  #  print("label " + str(i) + " has " + str(Nsamples) + " images") 

  #divide
  NsampleToTakeFromEachLabel = math.floor(N / Nclasses)
  #print("total data len = " + str(Ndata) + ",  Nclasses = " +  str(Nclasses) + ",  N=" +str(N) + ",  NsampleToTakeFromEachLabel = " +str(NsampleToTakeFromEachLabel))
  picked_indices=[];
  not_picked_indices = [];
  for i in range(len(count)):
    picked_idx = random.sample(count[i],NsampleToTakeFromEachLabel);
    for j in range(len(count[i])):
      if count[i][j] in picked_idx:
        picked_indices.append(count[i][j])
      else :
        not_picked_indices.append(count[i][j])
  
  return picked_indices , not_picked_indices

def DivideData (train_dataset,test_dataset,N):
  picked_indices , not_picked_indices = GetIndices(train_dataset,N)

  trainloader = torch.utils.data.DataLoader(torch.utils.data.Subset(train_dataset, picked_indices))
  testloader1 = torch.utils.data.DataLoader(torch.utils.data.Subset(train_dataset, not_picked_indices))
  testloader2 = torch.utils.data.DataLoader(test_dataset, shuffle=True)

  return trainloader , testloader1 ,testloader2

trainloader , testloader1 ,testloader2 = DivideData(FashionMNIST_train_dataset,FashionMNIST_test_dataset,100)

any idea how to concat testloader1 with testloader2

ptrblck · June 13, 2020, 5:53am

You could use ConcatDataset on the Subset and test_dataset and pass this concatenated dataset to a DataLoader.