Best way to extract output from GRU layer?

I am using a GRU in PyTorch. The output of the GRU has three dimensions, but I need two dimensions for a further calculation. I was wondering what the best way is to extract the output from the final layer; I have a few ideas:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence



class BaseModel(nn.Module):

    def __init__(
        self,
        embed_vecs,
        dropout=0.2,
        activation='relu',
    ):
        super().__init__()
        # initialize the embedding table from the provided pretrained vectors
        self.embedding = nn.Embedding(len(embed_vecs), embed_vecs.shape[1], padding_idx=0)
        self.embedding.weight.data = embed_vecs.clone()
        self.embed_drop = nn.Dropout(p=dropout)
        self.activation = getattr(F, activation)

class BiGRU(BaseModel):

    def __init__(
        self,
        embed_vecs,
        out_dim,
        rnn_dim=512,
        rnn_layers=1,
        dropout=0.2,
        activation='tanh',
        **kwargs
    ):
        super().__init__(embed_vecs, dropout, activation)
        assert rnn_dim % 2 == 0, '`rnn_dim` should be even.'

        # BiGRU: rnn_dim // 2 hidden units per direction, so the concatenated
        # bidirectional output has rnn_dim features
        emb_dim = embed_vecs.shape[1]
        self.rnn = nn.GRU(emb_dim, rnn_dim // 2, rnn_layers,
                          bidirectional=True, batch_first=True)

        self.res = nn.Linear(rnn_dim, out_dim)


    def forward(self, input):
        # `indices` would restore the original batch order if it were needed later
        text, length, indices = self.sort_data_by_length(input['text'], input['length'])

        x = self.embedding(text)
        x = self.embed_drop(x)

        packed_inputs = pack_padded_sequence(x, length, batch_first=True)
        x, _ = self.rnn(packed_inputs)
        # unpack back to a padded (batch, seq_len, rnn_dim) tensor
        x, _ = pad_packed_sequence(x, batch_first=True)

        # the output is 3-dim (batch, seq_len, rnn_dim); I need 2-dim for further calculation


        # first method (sum): dim=2 sums over the hidden features and gives
        # (batch, seq_len); summing over dim=1 instead would pool over time
        # and give (batch, rnn_dim)
        x_sum = x.sum(dim=2)
        print("sum", x_sum.shape)


        # second method (reshape): concatenates all timesteps into
        # (batch, seq_len * rnn_dim), so the size depends on the sequence length
        x_reshape = x.reshape(x.shape[0], -1)
        print("res", x_reshape.shape)


        # third method (taking the last timestep): gives (batch, rnn_dim)
        # NOTE: the batch is padded, so for sequences shorter than the batch
        # maximum, x[:, -1] picks up padding zeros rather than the real final
        # hidden state
        x_last = x[:, -1]
        print("last", x_last.shape)

    def sort_data_by_length(self, data, length):
        # sort the batch by decreasing length, as required by
        # pack_padded_sequence with enforce_sorted=True (the default)
        length = torch.as_tensor(length, dtype=torch.int64)
        length, sorted_indices = torch.sort(length, descending=True)
        sorted_indices = sorted_indices.to(data.device)
        data = data.index_select(0, sorted_indices)

        # inverse permutation: index_select(0, indices) on the sorted batch
        # restores the original order
        data_size = sorted_indices.size(-1)
        indices = torch.empty(data_size, dtype=torch.long)
        indices[sorted_indices] = torch.arange(data_size)

        return data, length, indices




# toy embedding matrix: vocabulary of 40, embedding dimension 3
embed_vecs = torch.rand(40, 3)
model = BiGRU(embed_vecs, 4)

# a padded batch of three sequences with true lengths 4, 3 and 2
datamm = [[0, 12, 11, 1], [1, 2, 3, 0], [1, 1, 0, 0]]
inpi = {'text': torch.tensor(datamm, dtype=torch.long),
        'length': torch.tensor([4, 3, 2])}
model(inpi)
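# For this toy batch (batch=3, padded length=4, rnn_dim=512) the prints are:
#   sum  torch.Size([3, 4])     <- depends on the sequence length
#   res  torch.Size([3, 2048])  <- also depends on the sequence length
#   last torch.Size([3, 512])   <- fixed (batch, rnn_dim)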

Which method is better?

I would guess the choice depends on your use case.
While the 3rd approach (taking the last timestep) is probably the most common one, the 1st one can also work if you pool over the time dimension, i.e. x.sum(dim=1) rather than dim=2 (as written, dim=2 sums over the hidden features and leaves a (batch, seq_len) tensor).
I would not use the 2nd one, as flattening the temporal dimension into the feature dimension ties the output size to the sequence length, which sounds wrong to me.
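
One caveat with the 3rd approach: pad_packed_sequence pads the shorter sequences with zeros, so x[:, -1] returns those zeros instead of the real final hidden state. Here is a minimal sketch of a padding-safe last-timestep extraction and of mean-pooling over time, assuming the (batch, seq_len, rnn_dim) output x and the sorted length tensor from your code (the function names are just for illustration):

import torch

def last_valid_timestep(x, length):
    # x: (batch, seq_len, hidden), zero-padded beyond each sequence's length
    # length: (batch,) number of valid timesteps per sequence
    idx = (length.to(x.device).long() - 1).view(-1, 1, 1).expand(-1, 1, x.size(2))
    return x.gather(1, idx).squeeze(1)  # (batch, hidden)

def masked_mean_pool(x, length):
    # average over the valid timesteps only (dim=1), not over the hidden dim
    mask = torch.arange(x.size(1), device=x.device)[None, :] < length.to(x.device)[:, None]
    summed = (x * mask.unsqueeze(-1)).sum(dim=1)
    return summed / length.to(x).clamp(min=1)[:, None]

Both return a fixed (batch, rnn_dim) tensor regardless of how the batch was padded, so a projection like self.res can be applied directly afterwards.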