Can we train an LSTM on continuous videos that each contain both positive-class and negative-class sequences, using several thousand such videos? My overall objective is to mark particular scenes in videos in real time (e.g. if I have frames 0-100 and frames 30-60 contain yoga scenes, I need to mark those frames).
Right now, the approach I'm following is to split each video into two categories, positive sequences and negative sequences, and train an LSTM on them (on top of a Mobnet CNN, with the FC layer replaced by LSTM layers). However, this does not give any improvement over Mobnet alone when we run evaluation on non-split videos.
Mobnet and the LSTM are trained separately: I save the output of Mobnet (with the FC layer removed) as NumPy arrays and then read these arrays to train the LSTM.
Here is a sample of the code used for this approach:
# Number of passes over the dataset made by the training loop in train().
epochs = 250
# Number of samples per mini-batch; passed to the DataLoader in train().
batch_size = 128
class Model(nn.Module):
    """LSTM classifier that labels a sequence from its final time step.

    A stacked LSTM reads the sequence and a linear layer maps the hidden
    output of the last time step to one score per class.

    Args:
        in_size: Length of each per-step input feature vector.
        classes_no: Number of output classes (size of the score vector).
        hidden_size: Width of the LSTM hidden state.
        layer_no: Number of stacked LSTM layers.

    The defaults reproduce the previously hard-coded configuration, so
    ``Model()`` builds the same network as before and existing state dicts
    (keys under ``lstm.*`` and ``linear.*``) still load.
    """

    def __init__(self, in_size=1024, classes_no=2, hidden_size=512, layer_no=2):
        super(Model, self).__init__()
        # batch_first=True: tensors are laid out as (batch, seq_len, features).
        self.lstm = nn.LSTM(in_size, hidden_size, layer_no, batch_first=True)
        self.linear = nn.Linear(hidden_size, classes_no)

    def forward(self, input_seq):
        """Return unnormalised class scores for each sequence in the batch.

        Args:
            input_seq: Tensor of shape ``(batch, seq_len, in_size)``.

        Returns:
            Tensor of shape ``(batch, classes_no)`` holding raw scores
            (no softmax is applied here).
        """
        output_seq, _ = self.lstm(input_seq)
        # Only the last time step is classified. NOTE(review): this presumes
        # sequences in a batch are not padded at the end - confirm with the
        # feature files.
        last_output = output_seq[:, -1]
        return self.linear(last_output)
def nploader(npfile):
    """Read the array stored in ``npfile`` with ``np.load`` and return it.

    Used as the sample loader callback for the dataset built in ``train``.
    """
    return np.load(npfile)
def train():
    """Train the LSTM classifier on saved feature arrays and save its weights.

    Samples are read from the class sub-folders of ``./featrs/`` (files whose
    names end in ``npy``) through ``nploader``. The model is optimised with
    plain SGD and cross-entropy loss for ``epochs`` passes, and the final
    ``state_dict`` is written to ``lstm.ckpt`` in the working directory.

    Returns:
        None.
    """
    # Extensions are given as a tuple: some torchvision releases pass this
    # value straight to str.endswith, which rejects a list.
    dataset = torchvision.datasets.DatasetFolder(
        './featrs/', nploader, ('npy',),
        transform=None, target_transform=None)

    # DatasetFolder enumerates samples one class folder after another, so
    # without shuffling nearly every mini-batch would contain a single label
    # and SGD would swing between the classes instead of separating them.
    data_loader = torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        num_workers=1)

    # Use the GPU when there is one; otherwise run on the CPU rather than
    # crash.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = Model().to(device)

    loss = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.001)
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=100, gamma=0.8)

    model.train()
    for epoch in range(epochs):
        for input_seq, target in data_loader:
            optimizer.zero_grad()
            # The model output already lives on `device`; only the batch and
            # its labels need to be moved.
            output = model(input_seq.to(device))
            err = loss(output, target.to(device))
            err.backward()
            optimizer.step()
        # Stepped once per epoch, after the optimizer updates, so the
        # learning rate is scaled by 0.8 every 100 epochs.
        scheduler.step()

    torch.save(model.state_dict(), 'lstm.ckpt')