How to configure the DataLoader in PyTorch to load batches of video-text pairs with varying numbers of video frames

I created a custom video-text dataset class in PyTorch. The dataset consists of video-text pairs: each video has already been converted into frames, stored in a separate folder per video, and the number of frames varies from video to video. The corresponding text for all the videos is stored line by line in one text file, and the paths for all the videos are listed in another text file.

I initialized the dataset and loaded it into a DataLoader with a batch size of 16. In my training loop it gives me batches of 16 video-text pairs, but with a fixed size of 16 for the frames of every video. However, the actual number of frames per folder differs, ranging from 30 to over 190.

Here is the code for my video-text dataset:

import os

import torch
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms

class VideoTextDataset(Dataset):
    def __init__(self, root_dir, split, transform=None, features_dir='/home/vid-txt'):
        self.root_dir = root_dir
        self.split = split
        self.transform = transform
        self.text_data, self.video_paths = self._load_data()
        self.features_dir = features_dir

    def __len__(self):
        return len(self.video_paths)

    def __getitem__(self, idx):
        video_path = os.path.join(self.features_dir, self.video_paths[idx])
        frames = []
        # Sort the listing: os.listdir returns files in arbitrary order,
        # so sorting keeps the frames in temporal order.
        for frame_file in sorted(os.listdir(video_path)):
            if frame_file.endswith('.png'):
                frame_path = os.path.join(video_path, frame_file)
                frame = Image.open(frame_path).convert('RGB')
                if self.transform:
                    frame = self.transform(frame)
                frames.append(frame)
        text = self.text_data[idx]
        return frames, text

    def _load_data(self):
        text_file = os.path.join(self.root_dir, 'phoenix2014T.{}.de'.format(self.split))
        with open(text_file, 'r') as f1:
            text_data = [line.strip() for line in f1.readlines()]

        sign_file = os.path.join(self.root_dir, 'phoenix2014T.{}.sign'.format(self.split))
        with open(sign_file, 'r') as f2:
            video_paths = [line.strip() for line in f2.readlines()]

        return text_data, video_paths

frame_transform = transforms.Compose([transforms.Resize((224, 224)), transforms.ToTensor()])

train_dataset = VideoTextDataset(root_dir='/home/vid-txt/Data', split='train', transform=frame_transform)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

for epoch in range(num_epochs):
    total_loss = 0.0

    for batch_idx, (video_frames, sentences) in enumerate(train_dataloader):
        print(len(video_frames), video_frames.size())
        print(sentences[0], len(sentences))
        video_frames = torch.stack(video_frames).to(device)
        sentences = ...  # tokenization/encoding of the sentences goes here

The snippet above also contains part of my training loop. Printing len(video_frames) outputs 16 on every run. My question is how to make the DataLoader load batches that keep the full number of frames for each video.

Could you check what os.listdir(video_path) returns and how many frames it finds?
Then add a debug print right before the frames.append call to make sure every frame is actually appended to the list, and finally print len(frames) inside __getitem__ just before returning.
If that length differs from the size the DataLoader returns for each sample, the collate_fn might be cutting it and would be the next function to inspect.
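A minimal sketch of the first check, using a throwaway temporary folder as a stand-in for one video's frame directory (the helper name count_png_frames is illustrative, not from the original code):

```python
import os
import tempfile

def count_png_frames(video_path):
    # Sorted so the count and order match what __getitem__ iterates over.
    return len([f for f in sorted(os.listdir(video_path)) if f.endswith('.png')])

# Quick self-check: 7 fake frame files plus one non-PNG file.
with tempfile.TemporaryDirectory() as d:
    for i in range(7):
        open(os.path.join(d, 'frame_%03d.png' % i), 'wb').close()
    open(os.path.join(d, 'notes.txt'), 'w').close()
    print(count_png_frames(d))  # 7
```

Comparing this count against len(frames) inside __getitem__ tells you whether frames are lost during loading or during collation.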
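If the per-sample frame counts turn out to be correct, the usual fix for batching variable-length clips is a custom collate_fn that pads each clip to the longest one in the batch. Here is a sketch, assuming __getitem__ returns (frames, text) where frames is either a (T, C, H, W) tensor or a list of (C, H, W) tensors (the name pad_video_collate is illustrative):

```python
import torch
from torch.nn.utils.rnn import pad_sequence

def pad_video_collate(batch):
    """Pad every clip in the batch to the longest clip along the frame axis."""
    frames, texts = zip(*batch)
    # pad_sequence needs one (T, C, H, W) tensor per clip, so stack the
    # per-frame tensors first if __getitem__ returned them as a list.
    frames = [f if torch.is_tensor(f) else torch.stack(f) for f in frames]
    lengths = torch.tensor([f.shape[0] for f in frames])
    padded = pad_sequence(frames, batch_first=True)  # (B, T_max, C, H, W)
    return padded, lengths, list(texts)
```

It would be passed to the loader as DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=pad_video_collate), and the returned lengths tensor lets downstream code mask out the padded frames.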