Forward function for a bidirectional RNN

I am working on a language model that’s trained on text sequences using one-hot encoding. I have an option for setting bidirectional to True, and I got it “working” (which just means the dimensions are correct and the program doesn’t crash), but there’s a big issue. When I run the code with a bidirectional RNN, it trains, but the loss is incredibly low almost instantly, ~0.08 after only one epoch. I knew this didn’t seem right.
I tried sampling from the model using a prime string of “The ”, and the model predicts “ ththththththththth”… and so on. I have no idea what’s causing this, but I’m guessing it has something to do with my data when running bidirectionally, because the program works perfectly fine when it is not bidirectional.
Here’s the network:

class CharRNN(nn.Module):
    ''' Character-level recurrent language model over one-hot encoded text.

        The network is a stacked LSTM / GRU / vanilla RNN followed by dropout
        and a linear decoder that maps every time step to one logit per
        character of the vocabulary.

        NOTE on `bidirectional=True`: the RNN output at step t is the
        concatenation of the forward state (which has read steps 0..t) and the
        backward state (which has read steps t..end). When the training target
        is "the next character", the backward half has therefore already seen
        the answer. That gives a near-zero training loss, and degenerate
        samples at generation time, where no future context exists. Use a
        unidirectional RNN for next-character generation.
    '''

    def __init__(self, text, rnn_type='LSTM', bidirectional=False, n_hidden=512, n_layers=4, dropout=0.3, lr=2e-3, initrange=1, cuda=False, cudnn_fastest=False, cudnn_benchmark=False):
        ''' Build the vocabulary, the recurrent layers and the decoder.

                text: training text; the vocabulary is derived from it
                rnn_type: 'LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU' (case-insensitive)
                bidirectional: run the RNN in both directions (see class note)
                n_hidden: hidden units per layer and direction
                n_layers: number of stacked recurrent layers
                dropout: dropout probability (between RNN layers and before the decoder)
                lr: learning rate, stored for the training code
                initrange: half-width of the uniform range used by `init_weights`
                cuda: move the model to the GPU
                cudnn_fastest / cudnn_benchmark: set the matching cuDNN flags

            Raises ValueError if `rnn_type` is not one of the supported names.
        '''
        # nn.Module must be initialised before any sub-module is assigned.
        super().__init__()

        self.rnn_type = rnn_type.upper()
        self.bidirectional = bidirectional
        self.drop = dropout
        self.n_hidden = n_hidden
        self.n_layers = n_layers
        # NOTE(review): the original line read `self.n_layers = n_layers = lr`,
        # which replaced the layer count with the learning rate. Storing `lr`
        # separately is the presumed intent -- confirm against the training code.
        self.lr = lr
        self.initrange = initrange

        self.use_cuda = cuda
        self.fastest = cudnn_fastest
        self.benchmark = cudnn_benchmark
        if cudnn_fastest: torch.backends.cudnn.fastest = True
        if cudnn_benchmark: torch.backends.cudnn.benchmark = True

        self.text = text
        self.int2char, self.char2int = get_lookup_tables(text)
        self.chars = tuple(self.char2int.keys())

        self.dropout = nn.Dropout(dropout)
        # Compare against the normalised name so that e.g. 'lstm' is accepted too.
        if self.rnn_type in ('LSTM', 'GRU'):
            self.rnn = getattr(nn, self.rnn_type)(len(self.chars), n_hidden, n_layers, dropout=dropout, bidirectional=bidirectional, batch_first=True)
        else:
            try:
                nonlin = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[self.rnn_type]
            except KeyError:
                raise ValueError('An invalid option for `--rnntype` was supplied, valid options are `LSTM`, `GRU`, `RNN_TANH`, or `RNN_RELU`') from None

            # `bidirectional` has to be forwarded here as well, otherwise the
            # decoder below (sized for 2 * n_hidden) would not match the output.
            self.rnn = nn.RNN(len(self.chars), n_hidden, n_layers, nonlinearity=nonlin, dropout=dropout, bidirectional=bidirectional, batch_first=True)
        self.decoder = nn.Linear(n_hidden*2 if bidirectional else n_hidden, len(self.chars))

        # NOTE(review): `init_weights` is not called here because nothing in
        # the visible code shows that it was; call it explicitly if wanted.

        if not cuda and torch.cuda.is_available():
            print('WARNING: CUDA argument was set to false. Your device supports CUDA, you should use it.')
        if cuda:
            # NOTE(review): the body of this branch was missing; moving the
            # model to the GPU matches `predict`, which moves its inputs there
            # when `use_cuda` is set.
            self.cuda()

    def forward(self, x, hc):
        ''' Forward pass through the network

                x: input batch, batch-first (the RNN is built with batch_first=True)
                hc: initial hidden state as returned by `init_hidden`

                returns (logits, hidden): logits has one row per (batch, step)
                pair and one column per character
        '''
        x, h = self.rnn(x, hc)
        x = self.dropout(x)

        # Fold batch and time into a single axis so that the decoder scores
        # every time step. `reshape` also copes with non-contiguous RNN output.
        width = self.n_hidden*2 if self.bidirectional else self.n_hidden
        x = x.reshape(x.size(0)*x.size(1), width)
        x = self.decoder(x)

        return x, h

    def init_weights(self):
        ''' Initialize weights of decoder (fully connected layer) '''

        # Set the decoder bias to all zeros
        self.decoder.bias.data.fill_(0)

        # Draw the decoder weights uniformly from (-initrange, initrange)
        self.decoder.weight.data.uniform_(-self.initrange, self.initrange)

    def init_hidden(self, batch_size):
        ''' Initialize hidden state of rnn

                batch_size: batch size

                returns a new zero tensor on the same device / dtype as the
                model parameters; if rnn type is an LSTM, returns a tuple of
                two such tensors (hidden state, cell state)
        '''
        # State shape is n_layers (x2 if bidirectional) x batch_size x n_hidden.
        weight = next(self.parameters()).data
        n_states = self.n_layers*2 if self.bidirectional else self.n_layers
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros(n_states, batch_size, self.n_hidden),
                    weight.new_zeros(n_states, batch_size, self.n_hidden))
        return weight.new_zeros(n_states, batch_size, self.n_hidden)

    def predict(self, char, h=None, cuda=False, top_k=None):
        ''' Predict the character after the given character

                char: starting character
                h: hidden state
                cuda: kept for interface compatibility; the device is chosen
                      by the `cuda` flag given to the constructor
                top_k: if set, sample only among the k most probable characters

                returns (predicted character, hidden state)

            This method does not change the train / eval flag of the model, so
            dropout stays active unless the caller switched to eval mode.
        '''
        if h is None: # If hidden state is not supplied, start from a zero state for a batch of 1
            h = self.init_hidden(1)

        x = np.array([[self.char2int[char]]])
        x = one_hot_encode(x, len(self.chars))
        inputs = torch.from_numpy(x)
        if self.use_cuda:
            inputs = inputs.cuda()

        h = detach_hidden(h)
        # Sampling needs no gradients; keep the whole forward pass out of autograd.
        with torch.no_grad():
            out, h = self.forward(inputs, h)
            # `out` is (steps, n_chars); normalise over the character axis.
            p = F.softmax(out, dim=1)
        if self.use_cuda:
            p = p.cpu()

        if not top_k:
            top_ch = np.arange(len(self.chars))
        else:
            p, top_ch = p.topk(top_k)
            # reshape(-1) keeps a 1-D array even when top_k == 1
            top_ch = top_ch.numpy().reshape(-1)

        p = p.numpy().reshape(-1)
        char = np.random.choice(top_ch, p=p/p.sum())

        return self.int2char[char], h

If someone could help, it would be great, thanks!

Dude I have the same experience. Did you solve it?