RNN is not able to overfit a single sample

Hi everyone, I’m practicing with RNNs on the task of action detection in untrimmed videos. Basically, the task is to classify each frame of the video as background_class or action_class, i.e. to localize when the actions occur within the video.
The model is a simple CNN whose extracted features are passed to an LSTM that makes a prediction at every timestep.
The problem is that my model is not able to completely overfit a single video sample: the loss doesn’t go any lower than about 2.0 after 80 epochs. Shouldn’t it go down further?
Is there a problem in my model? Maybe in the forward pass? If not, any suggestions on how to improve or change the model?

class Flatten(nn.Module):
    """Collapse every dimension after the first into a single one.

    An input of shape ``(N, d1, d2, ...)`` becomes ``(N, d1 * d2 * ...)``.
    """

    def __init__(self):
        super(Flatten, self).__init__()

    def forward(self, x):
        """Return ``x`` reshaped to ``(x.shape[0], -1)``."""
        # ``reshape`` instead of ``view``: ``view`` raises on non-contiguous
        # inputs (e.g. after ``transpose``/``permute``), while ``reshape``
        # returns a view when possible and copies only when it has to.
        return x.reshape(x.shape[0], -1)

class VideoModel(nn.Module):
    """Per-frame video classifier: frozen ResNet-18 features -> LSTMCell -> linear.

    Each frame is encoded by a frozen, ImageNet-pretrained ResNet-18 (its
    final fully-connected layer removed), projected to half the feature
    size, and fed step by step to an ``nn.LSTMCell``. A linear layer maps
    the hidden state at every step to class scores.

    Args:
        hidden_dim: Size of the LSTM hidden and cell states.
        num_classes: Number of scores produced per frame.
        dtype: dtype of the initial LSTM states and of the returned scores.
    """

    def __init__(self, hidden_dim=512, num_classes=22, dtype=torch.float32):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.dtype = dtype
        self.num_classes = num_classes

        backbone = models.resnet18(pretrained=True)
        # Read the feature size from the layer being removed instead of
        # hard-coding it, so it stays correct if the backbone is swapped.
        feat_dim = backbone.fc.in_features
        self.feat_extr = nn.Sequential(
            *list(backbone.children())[:-1],
            Flatten()
        )
        for param in self.feat_extr.parameters():
            param.requires_grad = False
        # ``requires_grad = False`` only freezes the weights. In training
        # mode BatchNorm still normalises with per-batch statistics and
        # updates its running estimates, so the "frozen" features would
        # drift. Eval mode makes the extractor truly fixed.
        self.feat_extr.eval()

        self.lin_transf = nn.Sequential(
            nn.Linear(feat_dim, feat_dim // 2),
        )

        self.lstm = nn.LSTMCell(feat_dim // 2, hidden_dim)
        self.classifier = nn.Linear(hidden_dim, num_classes)

    def train(self, mode=True):
        """Set the training mode, always keeping the frozen backbone in eval mode."""
        super().train(mode)
        # ``model.train()`` would otherwise switch the backbone's BatchNorm
        # layers back to training behaviour.
        self.feat_extr.eval()
        return self

    def forward(self, x):
        """Compute class scores for every frame.

        Args:
            x: Tensor of shape ``(batch_size, frames_per_sample, 3, H, W)``,
                e.g. ``(batch_size, frames_per_sample, 3, 180, 320)``.

        Returns:
            Tensor of shape ``(batch_size, frames_per_sample, num_classes)``
            holding the unnormalised class scores of each frame.
        """
        batch_size, num_frames = x.shape[0], x.shape[1]
        h_n = torch.zeros(batch_size, self.hidden_dim, dtype=self.dtype, device=x.device)
        c_n = torch.zeros(batch_size, self.hidden_dim, dtype=self.dtype, device=x.device)
        scores = torch.zeros(
            batch_size, num_frames, self.num_classes, dtype=self.dtype, device=x.device
        )
        for step, frame in enumerate(x.unbind(dim=1)):
            out = self.feat_extr(frame)
            out = self.lin_transf(out)
            h_n, c_n = self.lstm(out, (h_n, c_n))
            scores[:, step, :] = self.classifier(h_n)  # (batch_size, num_classes)
        return scores

A simple version of the code to replicate my results can be found here: https://colab.research.google.com/drive/1X17NfHl7ZXCVnrfnNMYkIz6XHClYr7PH?usp=sharing