Why is the validation accuracy of my MLP-GRU model so low?

My task is action recognition, where my inputs are MediaPipe keypoints. I have 300 imbalanced classes: the first few classes have over 500 samples each, while the majority have fewer than 20, so I am using WeightedRandomSampler to deal with this imbalance. Even so, my training and validation accuracy are still low.

I have tried the model below, which is an MLP followed by a GRU. My learning rate is 1e-06, the learning-rate step is 10, and the batch size is 32. After around 10 epochs, the accuracy has not reached 1%, and the training and validation losses plateau.

class MLP(nn.Module):
    """Per-frame MLP encoder followed by a GRU and a linear classifier.

    Each time step is flattened and projected to ``fc_size`` features by
    ``fc_pre``; the resulting sequence goes through a GRU, and a final
    linear layer maps every GRU output to ``num_classes`` logits.

    Args:
        num_classes: Number of output classes.
        rnn_layers: Number of stacked GRU layers.
        hidden_size: Size of the GRU hidden state.
        fc_size: Size of the per-frame embedding fed to the GRU.
        input_size: Number of features per frame after flattening.
            Defaults to 201.
    """

    def __init__(self, num_classes, rnn_layers, hidden_size, fc_size,
                 input_size=201):
        super().__init__()
        self.num_classes = num_classes
        self.rnn_layers = rnn_layers
        self.hidden_size = hidden_size
        self.fc_size = fc_size

        # Per-frame encoder. The non-linearity matters: two stacked Linear
        # layers with nothing in between collapse to a single linear map.
        self.fc_pre = nn.Sequential(
            nn.Linear(input_size, 100),
            nn.ReLU(),
            nn.Linear(100, fc_size),
        )
        # The GRU sizes follow the constructor arguments so that fc_pre,
        # rnn and fc always agree with each other.
        self.rnn = nn.GRU(
            input_size=fc_size,
            hidden_size=hidden_size,
            num_layers=rnn_layers,
            batch_first=True,
        )
        self.fc = nn.Linear(hidden_size, num_classes)

    def init_hidden(self, num_layers, batch_size):
        """Return an all-zero initial hidden state for the GRU.

        A GRU takes a single hidden tensor (an ``(h, c)`` tuple is what an
        LSTM takes). The tensor is created on the same device and with the
        same dtype as the model parameters.

        Args:
            num_layers: Number of GRU layers.
            batch_size: Number of sequences in the batch.

        Returns:
            Tensor of shape ``(num_layers, batch_size, hidden_size)``.
        """
        reference = next(self.parameters())
        return reference.new_zeros(num_layers, batch_size, self.hidden_size)

    def forward(self, inputs, hidden=None, steps=0):
        """Encode every frame, run the GRU and classify every time step.

        Args:
            inputs: Sequence of per-frame tensors, one per time step; the
                first dimension of each is the batch, the rest is flattened.
            hidden: Optional initial GRU hidden state; zeros when ``None``.
            steps: Unused; kept so existing callers keep working.

        Returns:
            Logits of shape ``(batch, seq_len, num_classes)``. No softmax is
            applied because ``nn.CrossEntropyLoss`` does that itself. For one
            label per sequence, reduce the time axis before the loss (for
            example ``outputs[:, -1]``).
        """
        # Flatten each frame and embed it. Stacking on dim 1 yields
        # (batch, seq_len, fc_size) on whatever device the inputs live on.
        embedded = [self.fc_pre(frame.reshape(frame.size(0), -1))
                    for frame in inputs]
        sequence = torch.stack(embedded, dim=1)

        # outputs: (batch, seq_len, hidden_size)
        outputs, hidden = self.rnn(sequence, hidden)
        return self.fc(outputs)

    def _init_weights(self, module):
        """Initialise ``nn.Linear`` weights from N(0, 1) and zero the bias.

        Not applied automatically; use ``model.apply(model._init_weights)``.
        """
        if isinstance(module, nn.Linear):
            # mean/std describe a normal distribution; uniform_ takes bounds
            # and rejects these keyword arguments.
            # NOTE(review): std=1.0 is large for layers this wide; confirm.
            nn.init.normal_(module.weight, mean=0.0, std=1.0)
            if module.bias is not None:
                # NOTE(review): the original was cut off after this check;
                # zeroing the bias is the assumed intent.
                nn.init.zeros_(module.bias)

Any suggestions for me? Thank you in advance.