I am trying to replicate a model from matlab and it does not learn

I am trying to replicate this model from matlab but for some reason, my loss function does not decrease a lot and stays near 1.80 (initial value) - model
Here is my code of the model and the training-accuracy loop:

class go(nn.Module):
  def __init__(self):
    super(go, self).__init__()
    self.model_go = torch.hub.load('pytorch/vision:v0.10.0', 'googlenet', pretrained=True)

  def forward(self, x):
    #feature extraction
    self.output = create_feature_extractor(self.model_go, return_nodes={'avgpool':0}) #return 10th layer as in paper
    self.val = self.output(x.float())['0']
    #flatten
    #self.output = torch.flatten(self.output)
    return self.val
class lstm(nn.Module):
  def __init__(self, input_size = 1024, hidden_size = 2000, num_layers = 2, bidirectional = True, n_class=2):
    super(lstm, self).__init__()
    self.input_size = input_size
    self.hidden_size = hidden_size
    self.num_layers = num_layers
    self.bidirectional = bidirectional
    self.n_class = n_class
    self.model_lstm = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, num_layers=self.num_layers , bidirectional=self.bidirectional, batch_first=True)
    self.hidden_state = None
    
    self.output_layer = nn.Sequential(
            nn.Linear(2 * self.hidden_size if self.bidirectional==True else self.hidden_size, self.n_class)
        )
    self.fc = nn.Sequential(
      nn.Linear(self.n_class, self.n_class),
      nn.Dropout(p=0.5, inplace=False)
    )
  def reset_hidden_state(self):
    self.hidden_state = None

  def forward(self, x):
    lstm_output, hidden_state = self.model_lstm(x, self.hidden_state)
    output = self.output_layer(lstm_output) #torch.Size([12, 30, 1024]) -> torch.Size([12, 30, 2])
    output = output[:, -1, :] #12 x n_class
    output = self.fc(output)
    return output
    #return torch.sigmoid(output)
modela = go().to('cpu')
modelb = lstm(n_class = len(new_categories_names)).to(device)
#modelb.load_state_dict(torch.load('/content/gdrive/My Drive/ColabNotebooks/human_activity_recognition/final_model1_train_test.pth'))
## Loss and optimizer
criterion = nn.CrossEntropyLoss()

#criterion = nn.BCELoss()
optimizer = optim.Adam(modelb.parameters(), lr=0.0001)
lr_scheduler = ReduceLROnPlateau(optimizer, mode='min',factor=0.5, patience=5,verbose=1)
for epoch in range(3):
  modelb.train()
  loss_epoch = 0

  #iterate over the videos
  for batch_idx, (_, _) in enumerate(zip(video_batches_train, label_batches_train)):
    print(batch_idx)
    inputs_train, targets_train = create_inputs(video_batches_train[batch_idx], label_batches_train[batch_idx], num_frames = 30) #get data -> torch.Size([12, 30, 3, 224, 224]), torch.Size([12, 6])
    num_vids = inputs_train.shape[0]
    # extract features of batch
    features = []
    #14 per batch
    for input in inputs_train:
      frms = []
      #video's frames, 30 per video
      for fr in input:
        frms.append(modela(fr.unsqueeze(0).to('cpu'))) #torch.Size([1, 1024, 1, 1])
      features.append(torch.cat(frms)) #torch.Size([30, 1024, 1, 1])
      del frms
    features = torch.stack(features) #torch.Size([12, 30, 1024, 1, 1])
    del inputs_train
    optimizer.zero_grad()
    modelb.reset_hidden_state()
    scores = modelb(features.view(num_vids, frames, -1).to(device)) #cuda, torch.Size([12, 30, 1024]) -> torch.Size([12, 2]),
    del features
    #loss = criterion(scores[:, -1, :],targets_train.max(1)[1].to(device))
    loss = criterion(scores,targets_train.max(1)[1].to(device))
    del targets_train
    loss.backward()
    optimizer.step()
    loss_epoch += loss.item()
    print(f"mini in epoch {epoch} :::: {loss.item()}") 

  #validation after each epoch
  with torch.no_grad():
    modelb.eval
    features = []
    #create test features
    inputs_test, targets_test = create_inputs(video_batches_test[0], label_batches_test[0], num_frames = 30) #only 1 batch for validation
    num_vids = inputs_test.shape[0] 
    for input_test in inputs_test:
      frms = []
      #video's frames, 40 per video
      for fr in input_test:
        frms.append(modela(fr.unsqueeze(0).to('cpu')))
      features.append(torch.cat(frms))
      del frms
    features = torch.stack(features) #20batchesx(14x20x3x224x224)
    del inputs_test

    scores = modelb(features.view(num_vids, frames, -1).to(device)) #cuda
    total = 100*(torch.nn.functional.softmax(scores).max(1)[1] == targets_test.to('cuda').max(1)[1]).sum()
    print('Accuracy: ', total/targets_test.shape[0])
    del scores, features, targets_test

  #reshuffle after each epoch
  x = np.array(video_batches_train).flatten()
  y = np.array(label_batches_train).reshape((-1, label_batches_train[0].shape[1]))
  x, y = shuffle(x, y)
  video_batches_train, label_batches_train = create_batches(x, y, num_batches = 20)
  del x,y
  lr_scheduler.step(total)
  del total
  print(f"Loss in epoch {epoch} :::: {loss_epoch/len(video_batches_train)}") 

The only difference is that my batch size is 12 instead of 16 and I am using 30 frames instead of 40 frames due to RAM issues.
Is there anything that I am missing?