Data loader returning series

Hello,

I’m trying to return a batch of images and numerical data from a dataloader. The images seem to be coming out correct. But it’s returning the batches of the whole data set.

class Inspection_Dataset(Dataset):
        """
        df: Dataframe containing all categorical, numerical and image columns
        numerical columns: list of numerical columns
        cat_columns: list of categorical columns
        image: column containing image file name
        root_dir: column containing root directory
        
        """
        def __init__(self, df, numerical_columns, cat_columns, image, 
                     root_dir, label, transform = None):
            
            self.df = df
            self.numerical_columns = numerical_columns
            self.cat_columns = cat_columns
            self.image_column = image
            self.root_dir = root_dir
            self.label = label
            self.transform = transform
            
        def __len__(self):
            return len(self.df)
            
        def __getitem__(self, idx):
            idx = list(self.df.index)
            
            #image
            image = Image.open(os.path.join(self.df.loc[idx, self.root_dir].values[0],
                                            self.df.loc[idx, self.image_column].values[0]))
            image = self.transform(image)
            
            #numerical columns
            numericals = np.asarray(self.df.loc[idx, self.numerical_columns].values)       
            
            return image, numericals
        
train_data = Inspection_Dataset(train_sample,
                                numerical_columns = numerical_columns,
                                cat_columns = non_loca_cat_columns,
                                image = 'file',
                                root_dir = 'root',
                                label = 'target',
                                transform = train_transform)

train_loader = DataLoader(train_data, batch_size = 10, shuffle = True)

for image, numericals in train_loader:
    break
print(image.size(), numericals.size())

From that final print statement, I get:

torch.Size([10,3,224,224]) torch.Size([10, 8345, 6])

I would think it should be:

torch.Size([10,3,224,224]) torch.Size([10,  6])

or

torch.Size([10,3,224,224]) torch.Size([10,  1, 6])

Does anyone see what I’m doing wrong?

Hi,

Try to run this line out of the dataloader and see what its shape is. Obviously, the issue is the way numericals is constructed as dataloader only create a batch of that data which as you have mentioned, [10, ..., ...] corresponds to batch size you have defined.

Thank you @Nikronic. I ended up rebuilding it like the below and it seems to work:

class Inspection_Dataset(Dataset):
        """
        df: Dataframe containing all categorical, numerical and image columns
        numerical columns: list of numerical columns
        cat_columns: list of categorical columns
        image: column containing image file name
        root_dir: column containing root directory
        
        """
        def __init__(self, df, numerical_columns = None,
                     cat_columns = None,
                     image = None, 
                     root_dir = None, 
                     label = None, 
                     transform = None):
            
            #df
            self.df = df
            #transform
            self.transform = transform
            #image
            self.image_column = image
            self.root_dir = root_dir
            
            #length
            self.n = df.shape[0]
            
            #output column
            self.label = np.array(self.df.loc[:, label])
           
            #cat columns
            self.cat_columns = cat_columns if cat_columns else []
            self.numerical_columns = [col for col in df[numerical_columns]]
                            
            if self.cat_columns:
                for column in self.cat_columns:
                    df[column] = df.loc[:, column].astype('category')
                    df[column] = df[column].cat.codes
                self.cat_columns = np.array(df[cat_columns]) 
            else:
                self.cat_columns = np.zeros((self.n, 1))  
            
            #numerical columns
            if self.numerical_columns:
                self.numerical_columns = df[self.numerical_columns].astype(np.float32).values
            else:
                self.numerical_columns = np.zeros((self.n, 1))   
                     
        def __len__(self):
            return self.n

            
        def __getitem__(self, idx):
            idx = list(self.df.index)
            
            image = Image.open(os.path.join(self.df.loc[idx, self.root_dir].values[0],
                                             self.df.loc[idx, self.image_column].values[0]))
            image = self.transform(image)
        
            return self.label[idx], self.numerical_columns[idx], self.cat_columns[idx], image
1 Like