Hello,
I am trying to run the training module (train_i3d.py) from https://github.com/piergiaj/pytorch-i3d on my own dataset of videos (the repo does this for Charades).
With some help, I have successfully gotten a Dataset module working to extract features. Now, in the training module, I run into a dimension issue here:
per_frame_logits = i3d(inputs)
# upsample to input size
print(per_frame_logits)
per_frame_logits = F.upsample(per_frame_logits, t, mode='linear')
# compute localization loss
loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, labels)
The labels have shape (1,), but the per-frame logits have shape (1, num_classes, frames); in my case that is (1, 2, 24), since I am using only 2 classes for testing. I am unsure how to change the dimensions of the per-frame logits (or of the labels) so that the loss can be computed; at the end of this post I have added a small sketch of what I think the target would need to look like. Here is the Dataset module I use:
import os
import cv2
import numpy
import torch
from pathlib import Path
import torch.utils.data as data_utl
class customVideoDataset(data_utl.Dataset):
    def __init__(self, path, frame_count, transforms=None):
        self.videos = []
        self.labels = []
        self.transforms = transforms
        self.frames = frame_count
        folder = Path(path)
        for label in sorted(os.listdir(folder)):
            for fname in os.listdir(os.path.join(folder, label)):
                self.videos.append(os.path.join(folder, label, fname))
                self.labels.append(label)
        # for fname in sorted(os.listdir(folder)):
        #     self.videos.append(os.path.join(folder, fname))
        self.label2index = {label: index for index, label in enumerate(sorted(set(self.labels)))}
        self.index2label = {index: label for index, label in enumerate(sorted(set(self.labels)))}
        self.label_array = [self.label2index[label] for label in self.labels]

    def __getitem__(self, idx):
        video = cv2.VideoCapture(self.videos[idx])
        stacked_frames = numpy.empty(shape=(self.frames, 224, 224, 3),
                                     dtype=numpy.dtype('float32'))  # each frame has shape h, w, channels
        frame_count = 0
        while video.isOpened() and frame_count < self.frames:
            ret, frame = video.read()
            if not ret:
                break
            frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
            frame = cv2.resize(frame, (224, 224))
            stacked_frames[frame_count] = frame
            frame_count += 1
        video.release()
        if self.transforms:
            stacked_frames = self.transforms(stacked_frames)
        stacked_frames = torch.from_numpy(stacked_frames.transpose((3, 0, 1, 2)))  # -> channels, frames, h, w
        return stacked_frames, self.label_array[idx]

    def __len__(self):
        return len(self.videos)

    def namer(self):
        return self.index2label

    def get_vids(self):
        names = []
        for name in self.videos:
            names.append(name)
        return names
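For completeness, this is roughly how I wrap the dataset and pass batches to the model; the path, frame_count and batch size below are just placeholder values I use for testing, to show the shapes I get:

from torch.utils.data import DataLoader

dataset = customVideoDataset('path/to/my/videos', frame_count=24)  # placeholder path
loader = DataLoader(dataset, batch_size=1, shuffle=True)

for inputs, labels in loader:
    print(inputs.shape)  # torch.Size([1, 3, 24, 224, 224]) -> batch, channels, frames, h, w
    print(labels.shape)  # torch.Size([1]) -> one integer class index per clip
    break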
Does someone know where the dimension issue arises from, and how I should reshape either the labels or the logits to make the loss computation work?
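In case it helps, here is a minimal sketch of how I imagine the labels would have to be expanded so that they match the logits for binary_cross_entropy_with_logits; the one_hot/expand part is only my guess, not code from the original repo:

import torch
import torch.nn.functional as F

batch, num_classes, t = 1, 2, 24
labels = torch.tensor([1])                             # integer class index, as my Dataset returns it
per_frame_logits = torch.randn(batch, num_classes, t)  # stand-in for the i3d output

# my guess: one-hot encode the class index and repeat it across all output frames
targets = F.one_hot(labels, num_classes).float()       # shape (batch, num_classes)
targets = targets.unsqueeze(2).expand(-1, -1, t)       # shape (batch, num_classes, t)

loc_loss = F.binary_cross_entropy_with_logits(per_frame_logits, targets)
print(loc_loss.item())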