Underfitting with LSTM/GRU

Hi every friends, I am stuck on training LSTM model. The input of LSTM has the size format of (seq_len, batch_size, input_len ), it is (5, 8, 2048). The following code will show more details of my model.

    def __init__(self):
        self.layer_size = 1
        self.batch_size = 6
        self.hidden_size = 512
        self.tmestep = 3
        self.hidden = self.init_hidden_lstm()
        self.lstm = nn.LSTM(2048, self.hidden_size, self.layer_size,batch_first=False)
        self.drop = nn.Dropout(0.2)
        self.fc1 = nn.Linear(2560, 512)
        self.fc2 = nn.Linear(512, 2)

    def init_hidden_lstm(self):
       return (torch.randn(self.layer_size, self.batch_size,self.hidden_size,  requires_grad=True).cuda(),
               torch.randn(self.layer_size, self.batch_size, self.hidden_size, requires_grad=True).cuda())

    def init_weight(self):
        for name, param in self.lstm.named_parameters():
          if 'bias' in name:
             nn.init.constant(param, 0.5)
             nn.init.constant(param[256:512], 0.5)
          elif 'weight' in name:

    def forward(self,in1,in2,in3,in4,in5):
        # in1,in2,in3,in4,in5: extracted features from CNN network
        self.hidden = repackage_hidden(self.hidden)
        in = torch.stack([in1, in2, in3,in4,in5], dim=0)
        out, self.hidden = self.lstm(in,self.hidden)
        out = out.permute(1,0,2)
        out= out.contiguous().view(self.batch_size,-1)
        o = self.fc1(out)
        o = self.drop(o)
        o = self.fc2(o)

    def repackage_hidden(h):
        if isinstance(h, torch.Tensor):
              return h.detach()
              return tuple(repackage_hidden(v) for v in h)

Currently, the training accuracy fluctuates between 0.55 and 0.48 from the beginning until 60th epoch.
I tried LSTM with 2,4,8 layers with different learning_rate(1, 0.1, 0.01), but the similar case happens. If you can guess the reason, please reply. I am appreciate your help!