Hello everybody,
I have a video dataset. From each video I extract one image frame and one audio spectrogram saved as an image. I have two main folders — one contains the video image frames and the other contains the audio spectrograms of each video. Both main folders have the same 8 subfolders, which are the classes.
My model has two inputs — one image frame and one audio spectrogram image. Each input is passed through a pretrained VGG16 model in parallel for feature extraction. Then the outputs of these two branches are concatenated into an 8192-dimensional vector and passed to the classification step.
So first, I create a dataset that combines these two datasets — one of image frames, the other of audio spectrogram images — so that the image frame and audio spectrogram of the same video are matched for the dataloader.
# Shared preprocessing for both modalities: resize every image to VGG16's
# expected 224x224 input and convert it from PIL format to a tensor.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
    ]
)
class ConcatDataset(torch.utils.data.Dataset):
    """Pairs two ImageFolder-style datasets item-by-item.

    Item ``i`` yields ``{"videoFrame": frame_i, "audioImage": spectrogram_i}``
    together with the shared class label, so the dataloader always serves the
    frame and the spectrogram extracted from the same video.
    """

    def __init__(self, firstDataset, secondDataset):
        self.firstDataset = firstDataset    # video image frames of the videos
        self.secondDataset = secondDataset  # audio spectrogram images of the videos

    def __getitem__(self, i):
        f_x, f_y = self.firstDataset[i]   # frame image, its class index
        s_x, s_y = self.secondDataset[i]  # spectrogram image, its class index
        # Guard against the two folder trees being ordered or sized
        # differently: both samples at index i must share the same class.
        if f_y != s_y:
            raise ValueError(
                f"label mismatch at index {i}: frame class {f_y} != audio class {s_y}"
            )
        return {"videoFrame": f_x, "audioImage": s_x}, f_y

    def __len__(self):
        # Use the shorter dataset so every index can be paired safely.
        return min(len(self.firstDataset), len(self.secondDataset))
# Build the two per-modality datasets, pair them index-by-index, and wrap the
# result in a shuffling DataLoader; shuffling the combined dataset keeps each
# (frame, spectrogram) pair together while randomizing batch order.
frame_dataset = datasets.ImageFolder(root=OUTPUT_DIR_OF_VIDEO_IMAGES, transform=transform)
audio_dataset = datasets.ImageFolder(root=OUTPUT_DIR_OF_SOUND_SPECTOGRAMS, transform=transform)
paired_dataset = ConcatDataset(frame_dataset, audio_dataset)
train_loader = DataLoader(paired_dataset, batch_size=batch_size, shuffle=True)
My model is
# Load VGG16 with pretrained ImageNet weights for transfer learning.
vggmodel = vgg16(weights=torchvision.models.VGG16_Weights.DEFAULT)
# Freeze the convolutional feature extractor so it is not updated by training.
# BUG FIX: the original wrote `param.require_grad` (missing the trailing "s"),
# which silently creates a new, meaningless attribute on the Parameter and
# freezes nothing; the real flag is `requires_grad`.
for param in vggmodel.features.parameters():
    param.requires_grad = False
class MyModel(nn.Module):
    """Two-stream classifier over 8 classes.

    A video frame and an audio-spectrogram image are each passed through the
    same frozen VGG16 feature extractor; the two 4096-d feature vectors are
    concatenated (8192-d) and classified by a small trainable head.
    """

    def __init__(self, backbone=None):
        """
        Args:
            backbone: optional feature extractor mapping an image batch to a
                ``(batch, 4096)`` feature tensor. Defaults to the module-level
                pretrained ``vggmodel`` with its final FC layer removed, as in
                the original code.
        """
        super().__init__()
        if backbone is None:
            backbone = vggmodel
            # Freeze the entire pretrained network; only classifier_last trains.
            for param in backbone.parameters():
                param.requires_grad = False
            # Replace the final 4096->1000 FC layer with Identity so the
            # backbone emits the 4096-d penultimate features.
            backbone.classifier[6] = nn.Identity()
        self.vgg16_modified = backbone
        self.classifier_last = nn.Sequential(
            nn.Linear(8192, 256),   # 8192 = 2 x 4096 concatenated features
            nn.ReLU(),
            nn.Dropout(0.4),
            nn.Linear(256, 8),      # 8 output classes
            nn.LogSoftmax(dim=1),   # emits log-probabilities; pair with nn.NLLLoss
        )

    def forward(self, x):
        """x: dict with "videoFrame" and "audioImage" image batches."""
        frame_feats = self.vgg16_modified(x["videoFrame"])
        audio_feats = self.vgg16_modified(x["audioImage"])
        # Concatenate along the feature dimension: (batch, 4096 + 4096).
        fused = torch.cat((frame_feats, audio_feats), dim=1)
        return self.classifier_last(fused)
# Instantiate the two-stream model and print its layer structure for inspection.
model = MyModel()
print(model)
This is my model
When I trained, I got low accuracy. I just wonder whether my logic is correct. If it is, maybe my dataset images are not suitable for this case.
First epoch accuracy: 20.000000298023224
Second one: 0.0
Third epoch: 20.000000298023224
Forth one: 60.00000238418579
Fifth one: 20.000000298023224
Thanks for everything,
Best regards.