Dataset for video is slow

Hi! I’m implementing a dataset that loads data from videos. I am following the example from X3D pytorch, which use EncodedVideo to load a video and get_clip to segment it to a clip.

class MyDataset(Dataset):
    def __init__(self, data_root):
        csv_list = get_csv_list(data_root)
        action_list = create_action_list(csv_list)
        self.data = self._prepare_data(action_list)   # This function returns a list of [
                                                      #  [path, label, start, end],
                                                      #  [path, label, start, end], ...]
    
    def _prepare_data(self, action_list):
        data = []
        for action in action_list:
            filename = action['filename']
            label = action['label']
            start, end = time2sec([action['start'], action['end']])
            data.append([filename, label, start, end])
        return data

    def __getitem__(self, idx):
        filename, label, start, end = self.data[idx]
        start_sec = 0
        end_sec = clip_duration + start_sec
        video_data = EncodedVideo.from_path(filename)
        video_data = video_data.get_clip(start_sec=start_sec, end_sec=end_sec)
        input = video_data["video"]

        return label, input
    
    def __len__(self):
        return len(self.data)

clip_duration = 5
dataset = MyDataset(data_root)
for data in dataset:
    label, input = data

The get_clip call is very expensive (1~1.5s when the clip_duration is 5). The memory usage is also high, and for some reason the RAM usage doesn’t drop when the code stops (I’m using Colab and the session crashes after a few iterations).
Is there a better way to implement a Dataset (and DataLoader) for datasets contaning videos?