Here is my custom dataset:
import os
import random

import numpy as np
import pandas as pd
import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader

class BirdsDataset(Dataset):
    """Dataset class for bird images."""

    def __init__(self, partition):
        super().__init__()
        np.random.seed(42)
        random.seed(42)
        imgPath = "./CUB_200_2011/"
        classes_path = os.path.join(imgPath, "classes.txt")
        self.partition = partition
        # Create lookup table for img_id --> class name
        id_to_class_data = pd.read_csv(classes_path, sep=' ', names=['img_id', 'class'])
        self.id_lookup = {}
        for i, row in id_to_class_data.iterrows():
            self.id_lookup[row["img_id"]] = row["class"].split('.')[1]
        self.X, self.y = self._load_data()
    # Lookup table getter
    def lookup(self, img_id):
        return self.id_lookup[img_id]
    # Resizes the shortest axis to 256 (the longer axis is left unchanged)
    #   Example 1: 500x300 --> 500x256
    #   Example 2: 300x500 --> 256x500
    def resizer(self, img):
        width, height = img.size
        if height < width:
            return img.resize((width, 256))
        else:
            return img.resize((256, height))
    # Center-crops the image along its longer axis
    #   Example 1: 500x256 --> 256x256 (center cropped)
    #   Example 2: 256x500 --> 256x256 (center cropped)
    def cropper(self, img):
        width, height = img.size
        if width < height:
            top = (height - 256) // 2
            # Crop exactly 256 pixels so an odd margin cannot yield a 257-pixel crop
            return img.crop((0, top, 256, top + 256))
        elif height < width:
            left = (width - 256) // 2
            return img.crop((left, 0, left + 256, 256))
        return img
    def __len__(self):
        """Return the size of the dataset."""
        return len(self.X)

    def __getitem__(self, idx):
        """Return the (image, label) pair at index `idx`."""
        return torch.from_numpy(np.array(self.X[idx])), torch.tensor(self.y[idx])
    def _load_data(self):
        """Load a single data partition from file."""
        # CUB-200-2011 contains 11788 images in total
        imgPath = "./CUB_200_2011/"
        images_path = os.path.join(imgPath, "images.txt")
        labels_path = os.path.join(imgPath, "image_class_labels.txt")
        # img_paths := n x 2: img_id, fileName
        img_paths = pd.read_csv(images_path, sep=' ', names=["img_id", "path"])
        # Image labels
        image_class_labels = pd.read_csv(labels_path, sep=' ', names=['img_id', 'y'])
        train_test_split = pd.read_csv(os.path.join(imgPath, 'train_test_split.txt'),
                                       sep=' ', names=['img_id', 'is_training_img'])
        # Combine paths and labels by id
        data = img_paths.merge(image_class_labels, on='img_id')
        data = data.merge(train_test_split, on='img_id')
        data = data.set_index("img_id")
        if self.partition not in ["train", "val", "test"]:
            raise ValueError("Partition {} does not exist".format(self.partition))
        if self.partition == "train":
            self.curr = data[data["is_training_img"] == 1]
        else:
            # Split the official test set in half: first half -> val, second half -> test
            self.test_data = data[data["is_training_img"] == 0]
            if self.partition == "val":
                self.curr = self.test_data.iloc[:self.test_data.shape[0] // 2, :]
            else:
                self.curr = self.test_data.iloc[self.test_data.shape[0] // 2:, :]
        parentPath = os.path.join(imgPath, "images")
        # Create X & y output
        X, y = [], []
        for i, row in self.curr.iterrows():
            label = row["y"]
            img = Image.open(os.path.join(parentPath, row["path"]))
            img = self.resizer(img)
            img = self.cropper(img)
            X.append(img)
            y.append(label)
        print(len(X), len(y))
        return np.array(X, dtype=object), np.array(y)
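For comparison, the same resize-then-center-crop preprocessing can be expressed with torchvision transforms. The sketch below is a minimal, hypothetical alternative, not my current code; note that `transforms.Resize(256)` scales the short side to 256 while preserving the aspect ratio, unlike `resizer` above, which stretches one axis. Converting each image with `img.convert("RGB")` would also keep every sample the same shape, since CUB-200-2011 contains a few grayscale images.

    # Minimal sketch (assumes torchvision is installed)
    from torchvision import transforms

    preprocess = transforms.Compose([
        transforms.Resize(256),      # short side -> 256, aspect ratio preserved
        transforms.CenterCrop(256),  # exact 256x256 center crop
    ])

    # Inside _load_data, the per-image steps could then become:
    #   img = Image.open(os.path.join(parentPath, row["path"])).convert("RGB")
    #   img = preprocess(img)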
I then run this code to create a dataset and a DataLoader instance:

va_dataset = BirdsDataset("val")
loader = DataLoader(va_dataset, batch_size=16, shuffle=False, num_workers=10, drop_last=True)
Then, upon looping through the loader (as shown below), the error message appears:
for i, (X, y) in enumerate(loader):
    print("X length:", len(X))
    print("y length:", len(y))
    print(i)
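Since the error only appears once the DataLoader starts batching, one way to narrow it down is to index the dataset directly and collect each sample's shape; the default collate function stacks samples into one tensor and fails if their shapes differ (e.g. grayscale vs. RGB images). A minimal diagnostic sketch, assuming `va_dataset` from above:

    # Collect the distinct sample shapes; more than one entry means the
    # default collate function cannot stack a batch.
    shapes = set()
    for idx in range(len(va_dataset)):
        X, y = va_dataset[idx]
        shapes.add(tuple(X.shape))
    print(shapes)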