I am trying to create a video recognition model, and I have realized that the most difficult part is creating an efficient Dataset and DataLoader for videos of different lengths.
I made my DataSet like this:
import torch
import torchvision as tv
import cv2
from PIL import Image
import numpy as np
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
from pathlib import Path
class VideoLoader(torch.utils.data.Dataset):
    """Map-style dataset that decodes a video into a stack of frame tensors.

    Args:
        data_path: Indexable collection of ``(video_path, label)`` pairs.
        classes: Container indexed by ``label``; the looked-up value is
            returned alongside the frames.
        transforms: Optional callable applied to every decoded frame.
        max_frames: Optional cap on the number of frames kept per video.
        frames_ratio: Optional sampling control. A ``float`` is the fraction
            of the video's frames to keep; any other truthy value is used as
            an absolute frame count. A falsy value keeps every frame.
    """

    def __init__(self, data_path, classes, transforms=None, max_frames=None, frames_ratio=None):
        super().__init__()
        self.data_path = data_path
        self.classes = classes
        self.frames_ratio = frames_ratio
        self.transforms = transforms
        self.max_frames = max_frames

    def _frame_indices(self, total_frames):
        """Return evenly spaced integer frame positions for one video.

        The positions span the whole video and are then truncated to
        ``max_frames`` entries when that limit is set.
        """
        if total_frames <= 0:
            return []
        if not self.frames_ratio:
            frames_to_pick = total_frames
        elif isinstance(self.frames_ratio, float):
            # Keep at least one frame so a small ratio on a short clip does
            # not produce an empty sample.
            frames_to_pick = max(1, int(total_frames * self.frames_ratio))
        else:
            frames_to_pick = int(self.frames_ratio)
        # Integer arithmetic gives exact floor(k * total / n) positions,
        # avoiding float frame indices and rounding drift.
        indices = [(k * total_frames) // frames_to_pick for k in range(frames_to_pick)]
        if self.max_frames:
            indices = indices[:self.max_frames]
        return indices

    def read_video(self, path):
        """Decode the sampled frames of the video at ``path``.

        Returns:
            A list of frames (each passed through ``transforms`` when one is
            set). The list is shorter than requested if decoding fails
            part-way, and empty if nothing could be read.
        """
        frames = []
        # str() lets callers pass pathlib.Path objects as well as strings.
        vc = cv2.VideoCapture(str(path))
        try:
            total_frames = int(vc.get(cv2.CAP_PROP_FRAME_COUNT))
            position = 0  # index of the frame the next read() will return
            for target in self._frame_indices(total_frames):
                # Seek BEFORE reading, and only when the wanted frame is not
                # already next in line; sequential reads need no seek at all.
                if target != position:
                    vc.set(cv2.CAP_PROP_POS_FRAMES, target)
                ok, frame = vc.read()
                if not ok:
                    break
                position = target + 1
                if self.transforms is not None:
                    frame = self.transforms(frame)
                frames.append(frame)
        finally:
            # Release the capture even if a transform raises.
            vc.release()
        return frames

    def __getitem__(self, index):
        """Return ``(frames, class_value)`` for the sample at ``index``.

        Raises:
            RuntimeError: If no frame could be decoded from the video.
        """
        v_path, label = self.data_path[index]
        frames = self.read_video(v_path)
        if not frames:
            raise RuntimeError(f"No frames could be read from video: {v_path}")
        # Frames that are not already tensors (e.g. raw decoder output when
        # no transform is configured) are converted so they can be stacked.
        tensors = [
            f if isinstance(f, torch.Tensor) else torch.as_tensor(np.asarray(f))
            for f in frames
        ]
        return torch.stack(tensors), self.classes[label]

    def __len__(self):
        """Return the number of ``(video_path, label)`` entries."""
        return len(self.data_path)
And here is the corresponding DataLoader:
def pad_frames(batch):
    """Collate ``(frames, label)`` samples of differing lengths into one batch.

    Args:
        batch: Iterable of ``(frames, label)`` pairs, where ``frames`` is a
            tensor whose first dimension is the sequence (frame) axis.

    Returns:
        A ``(packed_batch, labels)`` tuple: a ``PackedSequence`` built from
        all frame tensors, and a tensor holding the labels in batch order.
    """
    sequences = [frames for frames, _ in batch]
    labels = [label for _, label in batch]
    # enforce_sorted=False lets the sequences arrive in any length order.
    packed_batch = torch.nn.utils.rnn.pack_sequence(sequences, enforce_sorted=False)
    return packed_batch, torch.tensor(labels)
# Training loader: shuffled batches, collated by pad_frames.
train_dl = torch.utils.data.DataLoader(
    dataset=train_vl,
    batch_size=batch_size,
    shuffle=True,
    collate_fn=pad_frames,
)
I was wondering whether there is a more efficient way to pad videos of different lengths, as this appears to be the main bottleneck in my data loading. Any ideas?