Before I had only train and test dataset. Now, I want to split the dataset to train, validation and test. Also, switch between two phases (train and validation) every epoch in order to help me to adjust the hyper parameter.
This is snippet to split the dataset. I don’t know why validation and train is giving me wrong percentage of data.
# get all the image and mask path and number of images
folder_data = glob.glob("D:\\Neda\\Pytorch\\U-net\\my_data\\imagesResized\\*.png")
folder_mask = glob.glob("D:\\Neda\\Pytorch\\U-net\\my_data\\labelsResized\\*.png")
# split these path using a certain percentage
len_data = len(folder_data)
print("count of dataset: ", len_data)
# count of dataset: 992
train_size = 0.6
val_size = 0.2
test_size = 0.2
train_image_paths = folder_data[:int(len_data*train_size)]
print("count of train images is: ", len(train_image_paths))
#count of train images is: 595
valid_image_paths = folder_data[int(len_data*val_size)]
print("count of validation image is: ", len(valid_image_paths))
#count of validation image is: 57
test_image_paths = folder_data[int(len_data*test_size)]
print("count of test images is: ", len(test_image_paths))
#count of test images is: 57
#print(test_image_paths)
train_mask_paths = folder_mask[:int(len_data*train_size)]
valid_mask_paths = folder_mask[:int(len_data*val_size)]
test_mask_paths = folder_mask[int(len_data*test_size):]
train_dataset = CustomDataset(train_image_paths, train_mask_paths)
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=1, shuffle=True, num_workers=2)
valid_dataset = CustomDataset(valid_image_paths, valid_mask_paths)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=1, shuffle=False, num_workers=2)
test_dataset = CustomDataset(test_image_paths, test_mask_paths)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False, num_workers=2)
DataLoader = {
'train': train_loader,
'valid': valid_loader,
#'test': test_loader
}
also, I tried this but it’s return zero for validation and test is wrong as well.
len_data = len(folder_data)
print("count of dataset: ", len_data)
# count of dataset: 992
train_size = 0.6
val_size = 0.2
test_size = 0.2
train_image_paths = folder_data[:int(len_data*train_size)]
print("count of train images is: ", len(train_image_paths))
#count of train images is: 595
valid_image_paths = folder_data[int(len_data*train_size):int(len_data*val_size)]
print("count of validation image is: ", len(valid_image_paths))
#count of validation image is: 0
test_image_paths = folder_data[int(len_data*val_size):]
print("count of test images is: ", len(test_image_paths))
#count of test images is: 794
moreover, for switch between train and validation I am going to use this tutorial in Pytorch but this is for transfer learning and I am not doing transfer learning. would that be OK?