Defining “processed_file_names()” for my own torch_geometric.data.Dataset

I am trying to build my graph CNN model with PyTorch Geometric, but before really working on the model itself, I have run into some problems constructing my dataset with torch_geometric.data.Dataset.

The official doc “Creating Your Own Datasets” gives an example:

class MyOwnDataset(Dataset):
    """Skeleton dataset quoted from the "Creating Your Own Datasets" guide."""

    def __init__(self, root, transform=None, pre_transform=None):
        super(MyOwnDataset, self).__init__(root, transform, pre_transform)

    @property
    def raw_file_names(self):
        """Files in the raw_dir which must be found to skip the download."""
        return ['some_file_1', 'some_file_2', ...]

    @property
    def processed_file_names(self):
        """Files in the processed_dir which must be found to skip processing."""
        return ['data_1.pt', 'data_2.pt', ...]

    ### and more...

and says that users need to implement the following methods:

torch_geometric.data.InMemoryDataset.raw_file_names():

A list of files in the raw_dir which needs to be found in order to skip the download.

torch_geometric.data.InMemoryDataset.processed_file_names():

A list of files in the processed_dir which needs to be found in order to skip the processing.

In my situation, I have my local dataset so I can prepare all the data paths. So, I tried to make my own dataset as:

class MyOwnDataset(Dataset):
    """Dataset whose raw file names are supplied by the caller.

    NOTE: this is the failing version under discussion. ``input_data_paths``
    is stored only after the base-class constructor returns, but that
    constructor already evaluates ``processed_file_names`` -> ``raw_paths``
    -> ``raw_file_names`` (see the traceback that follows), which is what
    raises the AttributeError.
    """

    # I pass my data paths as parameter
    def __init__(self, root, input_data_paths, transform=None, pre_transform=None):
        super(MyOwnDataset, self).__init__(root, transform, pre_transform)
        # Assigned too late: the base constructor has already read the property.
        self.input_data_paths = input_data_paths

    def _download(self):
        """Override the base download hook with a no-op (data is local)."""
        # Don't need to download
        pass

    @property
    def raw_file_names(self):
        """Return the caller-supplied raw file names."""
        # Try to pass my paths here (but got error!)
        return self.input_data_paths

    @property
    def processed_file_names(self):
        """Return one ``data_<i>.pt`` name per raw path."""
        # Try to replicate names defined in process()
        return [ 'data_{}.pt'.format(i) for i in range(len(self.raw_paths)) ]

    def __len__(self):
        """Return the number of processed files."""
        return len(self.processed_file_names)

    def process(self):
        """Save one placeholder tensor per raw path into ``processed_dir``."""
        for i, _raw_path in enumerate(self.raw_paths):
            # Test dummy data
            data = torch.tensor([1,1,1])

            # Use `osp` (the os.path alias also used in get()); the earlier
            # spelling `ops` was an undefined name.
            torch.save(data, osp.join(self.processed_dir, 'data_{}.pt'.format(i)))

    def get(self, idx):
        """Load and return the processed item stored as ``data_<idx>.pt``."""
        data = torch.load(osp.join(self.processed_dir, 'data_{}.pt'.format(idx)))
        return data

# Test dummy paths
input_paths = ["./p1", "./p2"]

# Create dataset
# (this constructor call is where the AttributeError shown below is raised)
train_dataset = MyOwnDataset("/tmp/Data/train/", input_paths)

Unfortunately, I got an AttributeError:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-17-d69608eea4a9> in <module>
      4 # Create dataset
      5 train_params = {'batch_size':1, 'shuffle': True, 'num_workers': 1}
----> 6 train_dataset = MyOwnDataset("/tmp/Data/train/", input_paths)

<ipython-input-16-a9fe3ae7ad91> in __init__(self, root, input_data_paths, transform, pre_transform)
      2     def __init__(self, root, input_data_paths, 
      3                  transform=None, pre_transform=None):
----> 4         super(MyOwnDataset, self).__init__(root, transform, pre_transform)
      5         self.input_data_paths = input_data_paths
      6 

/usr/local/lib/python3.6/dist-packages/torch_geometric/data/dataset.py in __init__(self, root, transform, pre_transform, pre_filter)
     81 
     82         self._download()
---> 83         self._process()
     84 
     85     @property

/usr/local/lib/python3.6/dist-packages/torch_geometric/data/dataset.py in _process(self)
    119 
    120     def _process(self):
--> 121         if files_exist(self.processed_paths):  # pragma: no cover
    122             return
    123 

/usr/local/lib/python3.6/dist-packages/torch_geometric/data/dataset.py in processed_paths(self)
    108         r"""The filepaths to find in the :obj:`self.processed_dir`
    109         folder in order to skip the processing."""
--> 110         files = to_list(self.processed_file_names)
    111         return [osp.join(self.processed_dir, f) for f in files]
    112 

<ipython-input-16-a9fe3ae7ad91> in processed_file_names(self)
     17     def processed_file_names(self):
     18         # Try to replicate names defined in process()
---> 19         return [ 'data_{}.pt'.format(i) for i in range(len(self.raw_paths)) ]
     20 
     21     def __len__(self):

/usr/local/lib/python3.6/dist-packages/torch_geometric/data/dataset.py in raw_paths(self)
    101     def raw_paths(self):
    102         r"""The filepaths to find in order to skip the download."""
--> 103         files = to_list(self.raw_file_names)
    104         return [osp.join(self.raw_dir, f) for f in files]
    105 

<ipython-input-16-a9fe3ae7ad91> in raw_file_names(self)
     12     @property
     13     def raw_file_names(self):
---> 14         return self.input_data_paths
     15 
     16     @property

AttributeError: 'MyOwnDataset' object has no attribute 'input_data_paths'

Unlike the official example which defines the raw_file_names explicitly (e.g. [‘some_file_1’, ‘some_file_2’, …]), I want to reuse this Dataset class for several of my datasets so the paths are preferably variables.

Therefore, here comes my question: is there any way to define raw_file_names dynamically?

Besides, I have one more question about defining processed_file_names. I defined the names twice (in processed_file_names() and in process()), in the same way so that they match each other, but I do not believe this is best practice. Is there any way to define them dynamically, e.g. depending on the input paths?

Sorry for making such a lengthy question and thank you in advance for the help.

1 Like

I’m not hugely familiar with the @property decorator in Python, but from your error, I conclude that the property is read before your .__init__() has finished, i.e. before self.input_data_paths has been assigned. Therefore, self.input_data_paths does not exist yet and you get your AttributeError.

From tutorials I’ve found (for example this one), you need property setter methods to define properties dynamically.

1 Like

Thank you alex.veuthey!

While referencing the example you provided, I modified my Dataset class as follows:

class MyOwnDataset(Dataset):
    """Second attempt: keep the paths in a private attribute behind a property.

    NOTE: the attribute is still assigned after the base-class constructor
    runs, so the constructor's read of ``raw_file_names`` still fails (see
    the traceback that follows).
    """

    def __init__(self, root, input_data_paths, 
                 transform=None, pre_transform=None):
        super(MyOwnDataset, self).__init__(root, transform, pre_transform)
        # Change to private variable
        #self.input_data_paths = input_data_paths     # original line
        self._input_data_paths = input_data_paths

        
    def _download(self):
        """Override the base download hook with a no-op (data is local)."""
        # Don't need to download
        pass
    
    @property
    def raw_file_names(self):
        """Return the raw file names held in the private attribute."""
        # Change to private variable 
        # return self.input_data_paths    # original line
        return self._input_data_paths

    # My property setter here
    @raw_file_names.setter
    def raw_file_names(self, value):
        """Store ``value`` in the private attribute read by the getter."""
        # Write the backing attribute, not the property itself: assigning to
        # self.raw_file_names here would call this setter again without end.
        self._input_data_paths = value

    # no more changes...

But with no luck, similar error was returned:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-6-d69608eea4a9> in <module>
      4 # Create dataset
      5 train_params = {'batch_size':1, 'shuffle': True, 'num_workers': 1}
----> 6 train_dataset = MyOwnDataset("/tmp/Data/train/", input_paths)

<ipython-input-5-c37d75e54a81> in __init__(self, root, input_data_paths, transform, pre_transform)
      2     def __init__(self, root, input_data_paths, 
      3                  transform=None, pre_transform=None):
----> 4         super(MyOwnDataset, self).__init__(root, transform, pre_transform)
      5         #self.input_data_paths = input_data_paths
      6         self._input_data_paths = input_data_paths

/usr/local/lib/python3.6/dist-packages/torch_geometric/data/dataset.py in __init__(self, root, transform, pre_transform, pre_filter)
     81 
     82         self._download()
---> 83         self._process()
     84 
     85     @property

/usr/local/lib/python3.6/dist-packages/torch_geometric/data/dataset.py in _process(self)
    119 
    120     def _process(self):
--> 121         if files_exist(self.processed_paths):  # pragma: no cover
    122             return
    123 

/usr/local/lib/python3.6/dist-packages/torch_geometric/data/dataset.py in processed_paths(self)
    108         r"""The filepaths to find in the :obj:`self.processed_dir`
    109         folder in order to skip the processing."""
--> 110         files = to_list(self.processed_file_names)
    111         return [osp.join(self.processed_dir, f) for f in files]
    112 

<ipython-input-5-c37d75e54a81> in processed_file_names(self)
     22     def processed_file_names(self):
     23         # Try to replicate names defined in process()
---> 24         return [ 'data_{}.pt'.format(i) for i in range(len(self.raw_paths)) ]
     25 
     26     def __len__(self):

/usr/local/lib/python3.6/dist-packages/torch_geometric/data/dataset.py in raw_paths(self)
    101     def raw_paths(self):
    102         r"""The filepaths to find in order to skip the download."""
--> 103         files = to_list(self.raw_file_names)
    104         return [osp.join(self.raw_dir, f) for f in files]
    105 

<ipython-input-5-c37d75e54a81> in raw_file_names(self)
     13     @property
     14     def raw_file_names(self):
---> 15         return self._input_data_paths
     16     @raw_file_names.setter
     17     def raw_file_names(self, value):

AttributeError: 'MyOwnDataset' object has no attribute '_input_data_paths'

I am still trying to figure out how properties are handled in torch_geometric.data.Dataset classes.

1 Like

It’s closer, but not quite there yet. Your problem right now is that self.input_data_paths is never initialized when you call raw_file_names(). Also, the property name should be the same as what you call in the .__init__().

class MyOwnDataset(Dataset):
    """Suggested variant: property backed by a double-underscore attribute."""

    def __init__(self, root, input_data_paths, 
                 transform=None, pre_transform=None):
        super(MyOwnDataset, self).__init__(root, transform, pre_transform)
        # same name as property
        # NOTE(review): this assigns a plain attribute called input_data_paths;
        # the property defined below is called raw_file_names, so this line
        # does not go through its setter.
        self.input_data_paths = input_data_paths

    def _download(self):
        pass
    
    @property
    def raw_file_names(self):
        # hidden variable
        # (inside this class the name is mangled to
        # _MyOwnDataset__input_data_paths, as the traceback below shows)
        return self.__input_data_paths

    @raw_file_names.setter
    def raw_file_names(self, value):
        # affect value to hidden variable, this is called in the __init__ method
        self.__input_data_paths = value

Let me know if this works!

I am sorry to tell that it does not work:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-25-d69608eea4a9> in <module>
      4 # Create dataset
      5 train_params = {'batch_size':1, 'shuffle': True, 'num_workers': 1}
----> 6 train_dataset = MyOwnDataset("/tmp/Data/train/", input_paths)

<ipython-input-24-f615466d757b> in __init__(self, root, input_data_paths, transform, pre_transform)
      2     def __init__(self, root, input_data_paths, 
      3                  transform=None, pre_transform=None):
----> 4         super(MyOwnDataset, self).__init__(root, transform, pre_transform)
      5         self.input_data_paths = input_data_paths
      6 

/usr/local/lib/python3.6/dist-packages/torch_geometric/data/dataset.py in __init__(self, root, transform, pre_transform, pre_filter)
     81 
     82         self._download()
---> 83         self._process()
     84 
     85     @property

/usr/local/lib/python3.6/dist-packages/torch_geometric/data/dataset.py in _process(self)
    119 
    120     def _process(self):
--> 121         if files_exist(self.processed_paths):  # pragma: no cover
    122             return
    123 

/usr/local/lib/python3.6/dist-packages/torch_geometric/data/dataset.py in processed_paths(self)
    108         r"""The filepaths to find in the :obj:`self.processed_dir`
    109         folder in order to skip the processing."""
--> 110         files = to_list(self.processed_file_names)
    111         return [osp.join(self.processed_dir, f) for f in files]
    112 

<ipython-input-24-f615466d757b> in processed_file_names(self)
     21     def processed_file_names(self):
     22         # Try to replicate names defined in process()
---> 23         return [ 'data_{}.pt'.format(i) for i in range(len(self.raw_paths)) ]
     24 
     25     def __len__(self):

/usr/local/lib/python3.6/dist-packages/torch_geometric/data/dataset.py in raw_paths(self)
    101     def raw_paths(self):
    102         r"""The filepaths to find in order to skip the download."""
--> 103         files = to_list(self.raw_file_names)
    104         return [osp.join(self.raw_dir, f) for f in files]
    105 

<ipython-input-24-f615466d757b> in raw_file_names(self)
     12     @property
     13     def raw_file_names(self):
---> 14         return self.__input_data_paths
     15     @raw_file_names.setter
     16     def raw_file_names(self, value):

AttributeError: 'MyOwnDataset' object has no attribute '_MyOwnDataset__input_data_paths'

I am not very clear about what you mean by “same name as property” in the __init__() function. Do you mean I should change my variable name “input_data_paths” to “raw_file_names”, like this?

class MyOwnDataset(Dataset):
    """Variant that assigns through the raw_file_names property in __init__."""

    def __init__(self, root, input_data_paths, 
                 transform=None, pre_transform=None):
        super(MyOwnDataset, self).__init__(root, transform, pre_transform)
        # Goes through the raw_file_names setter defined below.
        self.raw_file_names = input_data_paths

    @property
    def raw_file_names(self):
        # Reads the double-underscore (name-mangled) attribute.
        return self.__raw_file_names

    @raw_file_names.setter
    def raw_file_names(self, value):
        # NOTE(review): single leading underscore here, whereas the getter
        # reads the double-underscore name; the two do not refer to the same
        # attribute.
        self._raw_file_names = value

Still, the same error.

AttributeError: 'MyOwnDataset' object has no attribute '_MyOwnDataset__raw_file_names'

You have a typo! There should be two underscores before raw_file_names in the setter, not just one. It should work if you correct this (it works on my machine at least).

Oh. I am sorry for the stupid mistake.

I made the change here:

class MyOwnDataset(Dataset):
    """Variant with matching double-underscore names in getter and setter."""

    def __init__(self, root, input_data_paths, 
                 transform=None, pre_transform=None):
        # The base constructor runs first; per the traceback below it reads
        # raw_file_names before the assignment on the next line is reached.
        super(MyOwnDataset, self).__init__(root, transform, pre_transform)
        self.raw_file_names = input_data_paths
    
    @property
    def raw_file_names(self):
        # Stored under the mangled name _MyOwnDataset__raw_file_names.
        return self.__raw_file_names
    @raw_file_names.setter
    def raw_file_names(self, value):
        self.__raw_file_names = value

But still doesn’t work, with the same old error:

---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-36-d69608eea4a9> in <module>
      4 # Create dataset
      5 train_params = {'batch_size':1, 'shuffle': True, 'num_workers': 1}
----> 6 train_dataset = MyOwnDataset("/tmp/Data/train/", input_paths)

<ipython-input-35-8202755ead20> in __init__(self, root, input_data_paths, transform, pre_transform)
      2     def __init__(self, root, input_data_paths, 
      3                  transform=None, pre_transform=None):
----> 4         super(MyOwnDataset, self).__init__(root, transform, pre_transform)
      5         self.raw_file_names = input_data_paths
      6 

/usr/local/lib/python3.6/dist-packages/torch_geometric/data/dataset.py in __init__(self, root, transform, pre_transform, pre_filter)
     81 
     82         self._download()
---> 83         self._process()
     84 
     85     @property

/usr/local/lib/python3.6/dist-packages/torch_geometric/data/dataset.py in _process(self)
    119 
    120     def _process(self):
--> 121         if files_exist(self.processed_paths):  # pragma: no cover
    122             return
    123 

/usr/local/lib/python3.6/dist-packages/torch_geometric/data/dataset.py in processed_paths(self)
    108         r"""The filepaths to find in the :obj:`self.processed_dir`
    109         folder in order to skip the processing."""
--> 110         files = to_list(self.processed_file_names)
    111         return [osp.join(self.processed_dir, f) for f in files]
    112 

<ipython-input-35-8202755ead20> in processed_file_names(self)
     21     def processed_file_names(self):
     22         # Try to replicate names defined in process()
---> 23         return [ 'data_{}.pt'.format(i) for i in range(len(self.raw_paths)) ]
     24 
     25     def __len__(self):

/usr/local/lib/python3.6/dist-packages/torch_geometric/data/dataset.py in raw_paths(self)
    101     def raw_paths(self):
    102         r"""The filepaths to find in order to skip the download."""
--> 103         files = to_list(self.raw_file_names)
    104         return [osp.join(self.raw_dir, f) for f in files]
    105 

<ipython-input-35-8202755ead20> in raw_file_names(self)
     12     @property
     13     def raw_file_names(self):
---> 14         return self.__raw_file_names
     15     @raw_file_names.setter
     16     def raw_file_names(self, value):

AttributeError: 'MyOwnDataset' object has no attribute '_MyOwnDataset__raw_file_names'

Can you share the code you tested on your machine, if it is not entirely the same as the code above?

That’s odd, I’m using the same code but slightly simplified for imports:

class MyOwnDataset():
    """Stand-alone reproduction without the torch_geometric base class.

    There is no base-class constructor here, so nothing reads
    ``raw_file_names`` before ``__init__`` assigns it.
    """

    def __init__(self, input_data_paths):
        # Goes through the raw_file_names setter below.
        self.raw_file_names = input_data_paths
    
    @property
    def raw_file_names(self):
        return self.__raw_file_names

    @raw_file_names.setter
    def raw_file_names(self, value):
        self.__raw_file_names = value

# Round trip: the value set in __init__ is read back through the property.
ds = MyOwnDataset('asdf')
print(ds.raw_file_names)
# >> 'asdf'
1 Like

It seems that it is really related to how the torch_geometric.data.Dataset class handles the stuff. Hope there will be some more replies to this thread and bringing in some more new ideas.

Thank you so much for your help!

1 Like

Try setting the members before calling the constructor of the base class:

class MyOwnDataset(Dataset):
    """Variant that stores the paths before running the base constructor."""

    def __init__(self, root, input_data_paths, 
                 transform=None, pre_transform=None):
        # Assign first: per the tracebacks above, the base constructor reads
        # raw_file_names while it runs, so the value must already be set.
        # NOTE(review): assumes the raw_file_names property and setter from
        # the earlier snippets are still defined on this class -- confirm.
        self.raw_file_names = input_data_paths
        super(MyOwnDataset, self).__init__(root, transform, pre_transform)