I am working on a language model that's trained on text sequences using one-hot encoding. I have an option for setting bidirectional to True, and I got it "working" (which just means the dimensions are correct and the program doesn't crash), but there's a big issue. When I run the code using a bidirectional RNN, it trains, but the loss is instantly incredibly low: ~0.08 after only one epoch. I knew this didn't seem right.
I tried sampling from the model using a prime string of "The ", and the model predicts " ththththththththth…" and so on. I have no idea what's causing this, but I'm guessing it has something to do with how my data is handled in the bidirectional case, because the program works perfectly fine when bidirectional is False.
class CharRNN(nn.Module):
    ''' Character-level recurrent language model over one-hot encoded text.

        The network is an LSTM / GRU / vanilla RNN followed by dropout and a
        linear decoder that maps every time step to one score per character.

        NOTE(review): with `bidirectional=True` the backward direction reads the
        characters that come after the current position. If the training targets
        are the inputs shifted by one character, each output has presumably
        already seen its own target, which would explain a near-zero loss and
        degenerate samples -- confirm against the training loop.
    '''

    def __init__(self, text, rnn_type='LSTM', bidirectional=False, n_hidden=512, n_layers=4, dropout=0.3, lr=2e-3, initrange=1, cuda=False, cudnn_fastest=False, cudnn_benchmark=False):
        ''' Build the recurrent layer and the decoder

            Arguments:
                text: training text; its characters define the vocabulary
                rnn_type: `LSTM`, `GRU`, `RNN_TANH` or `RNN_RELU` (case-insensitive)
                bidirectional: build a bidirectional recurrent layer
                n_hidden: hidden units per direction
                n_layers: number of stacked recurrent layers
                dropout: dropout probability (between layers and before the decoder)
                lr: learning rate, stored on the model for the training code
                initrange: half-width of the uniform range used by `init_weights`
                cuda: move the model to the GPU
                cudnn_fastest: set `torch.backends.cudnn.fastest`
                cudnn_benchmark: set `torch.backends.cudnn.benchmark`

            Raises:
                ValueError: if `rnn_type` is not one of the supported options
        '''
        # nn.Module must be initialized before any submodule is assigned.
        super().__init__()

        self.rnn_type = rnn_type.upper()
        self.bidirectional = bidirectional
        self.drop = dropout
        self.n_hidden = n_hidden
        # Previously `self.n_layers = n_layers = lr`, which replaced the layer
        # count with the learning rate and never stored `lr`.
        self.n_layers = n_layers
        self.lr = lr
        self.initrange = initrange

        self.use_cuda = cuda
        self.fastest = cudnn_fastest
        self.benchmark = cudnn_benchmark
        if cudnn_fastest:
            torch.backends.cudnn.fastest = True
        if cudnn_benchmark:
            torch.backends.cudnn.benchmark = True

        self.text = text
        self.int2char, self.char2int = get_lookup_tables(text)
        self.chars = tuple(self.char2int.keys())
        n_chars = len(self.chars)

        self.dropout = nn.Dropout(dropout)
        # Dispatch on the normalized name so that e.g. 'lstm' is accepted.
        if self.rnn_type in ('LSTM', 'GRU'):
            self.rnn = getattr(nn, self.rnn_type)(n_chars, n_hidden, n_layers, dropout=dropout, bidirectional=bidirectional, batch_first=True)
        else:
            try:
                nonlin = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[self.rnn_type]
            except KeyError:
                raise ValueError('An invalid option for `--rnntype` was supplied, valid options are `LSTM`, `GRU`, `RNN_TANH`, or `RNN_RELU`') from None
            # `bidirectional` must be forwarded here too: the decoder width and
            # `init_hidden` both assume the recurrent layer honours it.
            self.rnn = nn.RNN(n_chars, n_hidden, n_layers, nonlinearity=nonlin, dropout=dropout, bidirectional=bidirectional, batch_first=True)
        self.decoder = nn.Linear(n_hidden*2 if bidirectional else n_hidden, n_chars)

        if not cuda and torch.cuda.is_available():
            print('WARNING: CUDA argument was set to false. Your device supports CUDA, you should use it.')
        if cuda:
            self.cuda()

    def forward(self, x, hc):
        ''' Forward pass through the network

            Arguments:
                x: one-hot input, batch x seq_len x n_chars (the rnn is batch_first)
                hc: hidden state as produced by `init_hidden`

            Returns:
                scores of shape (batch*seq_len) x n_chars, and the new hidden state
        '''
        x, h = self.rnn(x, hc)
        x = self.dropout(x)
        # Flatten batch and time. The feature width is n_hidden, or 2*n_hidden
        # when bidirectional; reshape also copes with non-contiguous output.
        x = x.reshape(-1, x.size(-1))
        x = self.decoder(x)

        return x, h

    def init_weights(self):
        ''' Initialize weights of decoder (fully connected layer) '''

        # Set bias tensor to all zeros
        nn.init.zeros_(self.decoder.bias)

        # Apply random uniform weights to decoder in (-initrange, initrange)
        nn.init.uniform_(self.decoder.weight, -self.initrange, self.initrange)

    def init_hidden(self, batch_size):
        ''' Initialize hidden state of rnn

            Arguments:
                batch_size: batch size

            Returns:
                new zero-filled torch Tensor of size
                n_layers (x2 if bidirectional) x batch_size x n_hidden;
                if rnn type is an LSTM, returns a tuple of 2 of these
                (hidden state and cell state)
        '''
        # Borrow dtype and device from an existing parameter.
        weight = next(self.parameters()).data
        n_states = self.n_layers*2 if self.bidirectional else self.n_layers
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros(n_states, batch_size, self.n_hidden),
                    weight.new_zeros(n_states, batch_size, self.n_hidden))
        return weight.new_zeros(n_states, batch_size, self.n_hidden)

    def predict(self, char, h=None, cuda=False, top_k=None):
        ''' Predict the character after the given character

            Arguments:
                char: starting character
                h: hidden state
                cuda: unused here; the device is chosen by the constructor's `cuda` flag
                top_k: sample only among the k most probable next characters
                       (all characters when not given)

            Returns:
                predicted character
                h: hidden state
        '''
        if h is None:  # If hidden state is not supplied, use a hidden state for batch size 1
            h = self.init_hidden(1)

        x = np.array([[self.char2int[char]]])
        x = one_hot_encode(x, len(self.chars))
        inputs = torch.from_numpy(x)
        if self.use_cuda:
            inputs = inputs.cuda()

        # Inference only: no autograd graph is needed for sampling.
        with torch.no_grad():
            h = detach_hidden(h)
            out, h = self.forward(inputs, h)
            # `out` is (1, n_chars); normalize over the character axis.
            p = F.softmax(out, dim=1)
        if self.use_cuda:
            p = p.cpu()

        if not top_k:
            top_ch = np.arange(len(self.chars))
        else:
            p, top_ch = p.topk(top_k)
            # reshape(-1) rather than squeeze(): with top_k == 1 squeeze() gives
            # a 0-d array, which np.random.choice reads as a population size.
            top_ch = top_ch.numpy().reshape(-1)

        p = p.numpy().reshape(-1)
        char = np.random.choice(top_ch, p=p/p.sum())

        return self.int2char[char], h

Dude I have the same experience. Did you solve it?