Hi, I am trying to simulate the label shift problem. To do so, I need to build custom datasets (in this case from CIFAR10) by specifying the number of images in each class, for example [5000, 3000, 1500, …], a vector of length 10 because there are 10 classes. I have attached my code below. The code seems to work, but when I set every element of the vector to 5000 (i.e., the whole training set), I get an accuracy of 81% with ResNet18, whereas I get 93% on the original dataset with the same network hyperparameters. I also compared the image tensor values in both cases and they look alike.
Any help would be appreciated.
class MyDataset(Dataset):
    """Indexable dataset over pre-built sample and label containers.

    Each item is the tuple ``(sample, label, index, domain)``: the sample and
    label stored at ``index``, the index itself, and the constant ``domain``
    tag given at construction time.
    """

    def __init__(self, data, target, domain, transform=None):
        """Store the containers, the domain tag and the optional transform.

        Args:
            data: Indexable container of samples; its length is the dataset
                length.
            target: Indexable container of labels, indexed like ``data``.
            domain: Value returned unchanged as the last element of every item.
            transform: Optional callable applied to a sample on every access.
        """
        self.data, self.target = data, target
        self.transform, self.domain = transform, domain

    def __getitem__(self, index):
        """Return ``(sample, label, index, domain)`` for ``index``."""
        sample, label = self.data[index], self.target[index]
        # Truthiness check (not ``is not None``): a falsy transform is skipped.
        if self.transform:
            sample = self.transform(sample)
        return sample, label, index, self.domain

    def __len__(self):
        """Return the number of stored samples."""
        return len(self.data)
def Custom_Dataset(custom_labels, train, dataset=None):
    """Build a class-imbalanced subset of a labelled image dataset.

    Every sample of the source dataset is read once and converted with
    ``np.array``. For each class id ``c`` (the position in ``custom_labels``)
    the first ``custom_labels[c]`` samples of that class, in dataset order,
    are selected. The selection is then shuffled with ``random.shuffle``.

    NOTE: samples are read through the dataset's item access, so whatever
    processing that access performs is applied exactly once here and the
    result is stored. Any randomness in that processing is therefore fixed
    at build time instead of being re-drawn on every epoch.

    Args:
        custom_labels: Sequence of non-negative ints; entry ``c`` is the
            number of samples to keep from class ``c``.
        train: Used only when ``dataset`` is ``None``: truthy selects the
            module-level ``trainset``, falsy selects ``testset``.
        dataset: Optional indexable, sized source yielding ``(image, label)``
            pairs. When given, it is used instead of the module-level sets.

    Returns:
        Tuple ``(target_images, target_labels)``. ``target_images`` has the
        dtype of the stacked source images. ``target_labels`` is a float64
        array, as before; presumably callers must cast it to an integer
        type before using it with a classification loss (TODO confirm).

    Raises:
        ValueError: If a requested count is negative.
        IndexError: If more samples are requested for a class than the
            source dataset contains.
    """
    if dataset is None:
        # Fall back to the module-level splits this function originally used.
        dataset = trainset if train else testset

    images = []
    labels = []
    # Use the real dataset length instead of hard-coded 50000 / 10000.
    for position in range(len(dataset)):
        img, label = dataset[position]
        images.append(np.array(img))
        labels.append(label)
    images = np.array(images)
    labels = np.array(labels)

    chosen = []
    for category, count in enumerate(custom_labels):
        if count < 0:
            raise ValueError(
                "class {}: requested count must be non-negative, got {}".format(
                    category, count
                )
            )
        # Positions of this class in dataset order (replaces the
        # ``np.object`` array of index lists, which fails on NumPy >= 1.24).
        class_positions = np.flatnonzero(labels == category)
        if count > len(class_positions):
            raise IndexError(
                "class {}: requested {} samples but only {} are available".format(
                    category, count, len(class_positions)
                )
            )
        chosen.extend(class_positions[:count].tolist())

    # Same RNG source as before so seeded runs stay reproducible.
    random.shuffle(chosen)

    # Explicit integer dtype keeps the indexing valid for an empty selection.
    order = np.asarray(chosen, dtype=np.intp)
    target_images = images[order]
    # Look each label up directly instead of searching every class list;
    # float64 matches the dtype the function has always returned.
    target_labels = labels[order].astype(np.float64)
    return target_images, target_labels
# Per-class sample counts for the simulated label shift, indexed by class id:
# classes 0, 1, 8 and 9 keep 5000 images each; classes 2-7 keep 500 each.
custom_labels_train = [5000] * 2 + [500] * 6 + [5000] * 2
target_images, target_labels = Custom_Dataset(custom_labels_train, train=True)

# No transform is passed here, so MyDataset serves the arrays exactly as
# Custom_Dataset stored them.
MyTrainSet = MyDataset(data=target_images, target=target_labels, domain=1)
trainloader = torch.utils.data.DataLoader(
    MyTrainSet,
    batch_size=128,
    shuffle=True,
    num_workers=0,
)