Hi everyone, I’m practicing with RNNs on the task of action detection in untrimmed videos. Basically, the task is to classify each frame of the video as either background_class or action_class, i.e. to localize when the actions occur within the video.
The model is a simple CNN and the features extracted are passed to an LSTM that performs the prediction at every timestep.
The problem is that my model is not able to completely overfit a single video sample (the loss doesn’t go any lower than about 2.0 after 80 epochs; shouldn’t it go down a little further?).
Is there a problem in my model? Maybe in the forward pass? If not, any suggestions on how to improve or change the model?
class Flatten(nn.Module):
    """Collapse every dimension after the first (batch) one into a single axis."""

    def forward(self, x):
        """Return ``x`` reshaped to ``(x.shape[0], -1)``.

        ``reshape`` is used rather than ``view`` so that non-contiguous
        inputs (e.g. the result of ``transpose``/``permute``) are accepted;
        a copy is made only when a view is impossible.
        """
        return x.reshape(x.shape[0], -1)
class VideoModel(nn.Module):
    """Per-frame action classifier: frozen ResNet-18 features fed to an LSTM cell.

    Each frame is encoded by the frozen CNN, projected by a linear layer,
    and fed to an ``nn.LSTMCell``; a linear classifier turns every hidden
    state into class scores, giving one prediction per timestep.

    Args:
        hidden_dim: size of the LSTM hidden/cell state.
        num_classes: number of output classes per frame.
        dtype: dtype used for the empty result returned when the input has
            zero frames. Kept for backward compatibility; for non-empty
            inputs the LSTM state and the scores follow the dtype of the
            computed features.
    """

    def __init__(self, hidden_dim=512, num_classes=22, dtype=torch.float32):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dtype = dtype
        self.num_classes = num_classes

        backbone = models.resnet18(pretrained=True)
        # Read the feature size from the layer being dropped instead of
        # hard-coding 512.
        feat_vect_dim = backbone.fc.in_features
        # Everything except the final fully-connected layer, then flatten
        # the pooled output to (batch, feat_vect_dim).
        self.feat_extr = nn.Sequential(
            *list(backbone.children())[:-1],
            Flatten()
        )
        for param in self.feat_extr.parameters():
            param.requires_grad = False
        # Freezing the parameters does not freeze BatchNorm statistics;
        # the extractor must also stay in eval mode (see ``train``).
        self.feat_extr.eval()

        self.lin_transf = nn.Sequential(
            nn.Linear(feat_vect_dim, feat_vect_dim // 2),
        )
        self.lstm = nn.LSTMCell(feat_vect_dim // 2, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def train(self, mode=True):
        """Set the training mode, keeping the frozen extractor in eval mode.

        Without this, ``model.train()`` would make the backbone's BatchNorm
        layers normalise with batch statistics and update their running
        estimates, even though its parameters are frozen.
        """
        super().train(mode)
        self.feat_extr.eval()
        return self

    def forward(self, x):
        """Score every frame of every clip.

        Args:
            x: tensor of shape (batch_size, frames_per_sample, 3, H, W);
                the original author used 180x320 frames.

        Returns:
            Tensor of shape (batch_size, frames_per_sample, num_classes).
        """
        batch_size, num_frames = x.shape[0], x.shape[1]
        if num_frames == 0:
            # Nothing to score; keep the (batch, 0, classes) shape.
            return torch.zeros(
                batch_size, 0, self.num_classes,
                dtype=self.dtype, device=x.device,
            )

        # ``None`` makes LSTMCell start from zero hidden/cell states that
        # match the dtype and device of its input.
        state = None
        step_scores = []
        for step in range(num_frames):
            feats = self.lin_transf(self.feat_extr(x[:, step]))
            state = self.lstm(feats, state)
            step_scores.append(self.classifier(state[0]))  # (batch_size, num_classes)
        return torch.stack(step_scores, dim=1)
A simple version of the code to replicate my results can be found here: https://colab.research.google.com/drive/1X17NfHl7ZXCVnrfnNMYkIz6XHClYr7PH?usp=sharing