Why does using a BiLSTM produce worse results than a unidirectional LSTM for text generation?

I created a very simple network and tried to generate text in the style of Anna Karenina (whose text can be downloaded from here, from the Gutenberg.org library).
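
My preprocessing is not included here, but it does roughly the following (a simplified sketch; the file name is a placeholder):

import numpy as np

with open('anna_karenina.txt', 'r', encoding='utf-8') as f:    # placeholder file name
    text = f.read()

unique_chars = tuple(set(text))                                 # character vocabulary
char2int = {ch: ii for ii, ch in enumerate(unique_chars)}
corpus_digitized = np.array([char2int[ch] for ch in text])      # whole book as integer codes
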
I used a normal (unidirectional) LSTM and it generated text such as:

...
epoch: 59/60 loss: 1.2538
generated text : 
the  ive been a definiteness
of hearing her husband the merchants

the standard of such as its a picture he said smiling to himself with a firstries of anna went to the shade in
alexey alexandrovitch whate

However, when I tried to use a bidirectional LSTM, the loss dropped to roughly a tenth of that, but the generated text is awful (my sampling procedure is sketched after the sample below).
Here is a sample using the BiLSTM:

...
epoch: 59/60 loss: 0.1313
generated text: 
the  a a atat dededededededededededededededededededededededededededededededed dodododododododododododododododododododododododododododododododododododod downwnen nenenenenenenensns
susususeseseseseststststs

This is my network, which as you can see is pretty simple:

import torch
import torch.nn as nn


class lstm_char(nn.Module):
    def __init__(self, rnn_type='rnn', unique_chars=(), hidden_size=30,
                 num_layers=1, dropout=0.3, bidirection=False, act='tanh'):
        super().__init__()

        self.unique_char = unique_chars            # expects an iterable of characters
        self.int2char = dict(enumerate(self.unique_char))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        self.input_size = len(unique_chars)
        self.output_size = self.input_size
        self.hidden_size = hidden_size
        self.drp = nn.Dropout(dropout)
        self.rnn_type = rnn_type.lower()

        self.direction = 2 if bidirection else 1

        if rnn_type.lower() == 'rnn':
            self.rnn = nn.RNN(self.input_size,
                            hidden_size,
                            num_layers,
                            batch_first=True,
                            dropout=dropout,
                            bidirectional=bidirection)
        
        elif rnn_type.lower() == 'gru':
            self.rnn = nn.GRU(self.input_size,
                            hidden_size,
                            num_layers,
                            batch_first=True,
                            dropout=dropout,
                            bidirectional=bidirection)
        else:
            self.rnn = nn.LSTM(self.input_size,
                            hidden_size,
                            num_layers,
                            batch_first=True,
                            dropout=dropout,
                            bidirectional=bidirection)
        
        self.fc = nn.Linear(hidden_size*self.direction, self.output_size)
    
    def forward(self, input, hidden_states):
        rnn_outputs, hidden_states = self.rnn(input, hidden_states)
        # flatten (batch, seq_len, hidden*directions) -> (batch*seq_len, hidden*directions)
        outputs = rnn_outputs.reshape(-1, self.hidden_size * self.direction)
        outputs = self.drp(outputs)
        outputs = self.fc(outputs)
        return outputs, hidden_states
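
Note that I just pass hidden_states=None for the first batch below, which PyTorch initializes to zeros. An explicit initialization would have to include the direction factor in the first dimension, something like this sketch (init_hidden is not part of my actual code, only an illustration):

import torch

def init_hidden(model, batch_size, device='cpu'):
    # sketch: the hidden state shape is (num_layers * num_directions, batch, hidden_size)
    shape = (model.rnn.num_layers * model.direction, batch_size, model.hidden_size)
    h0 = torch.zeros(shape, device=device)
    c0 = torch.zeros(shape, device=device)
    # LSTMs need an (h, c) tuple; RNN/GRU take a single tensor
    return (h0, c0) if model.rnn_type == 'lstm' else h0
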

And this is my training loop (the one_hot and get_next_batch helpers it uses are sketched after it):

from torch import optim

hidden_size = 512
layers_cnt = 1
bidirection = True
rnn_type = 'lstm'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = lstm_char(rnn_type=rnn_type,
                  unique_chars=unique_chars, 
                  hidden_size=hidden_size,
                  num_layers=layers_cnt, 
                  bidirection=bidirection)

model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr = 0.01)
criterion = nn.CrossEntropyLoss()

epochs = 60
clip = 5.
interval = 1000
batch_size = 128
seq_len = 100          # window length per sample (value assumed; not defined in the snippet I pasted)
label_length = len(unique_chars)
hidden_states = None

val_ratio = 0.2
val_idx = int(corpus_digitized.size * (1-val_ratio))
train = corpus_digitized[:val_idx]
val = corpus_digitized[val_idx:]
assert train.size + val.size == corpus_digitized.size ,'they must be equal!'

for e in range(epochs):
    for i, (data, label) in enumerate(get_next_batch(train, batch_size,seq_len=seq_len)):

        model.train()
        data = torch.from_numpy(one_hot(data,length=label_length)).to(device)
        label = torch.from_numpy(label).to(device)

        output, hidden_states = model(data, hidden_states)
        # detach the hidden state so backprop does not reach back into previous batches
        if model.rnn_type == 'lstm':
            hidden_states = tuple(h.data for h in hidden_states)
        else:
            hidden_states = hidden_states.data
        
        # flatten the labels so they line up with the flattened outputs
        label = label.view(batch_size * seq_len).long()
        loss = criterion(output, label)
        optimizer.zero_grad()
        loss.backward()
        # note the trailing _, which indicates an in-place operation
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)
        optimizer.step()
        if i%interval==0:
            print(f'epoch: {e}/{epochs} loss: {loss.item():.4f}')
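
The one_hot and get_next_batch helpers used above are not included either; they do roughly the following (simplified sketches, so details may differ from my actual implementations):

import numpy as np

def one_hot(arr, length):
    # (batch, seq_len) integer codes -> (batch, seq_len, length) float32 one-hot vectors
    out = np.zeros((arr.size, length), dtype=np.float32)
    out[np.arange(arr.size), arr.flatten()] = 1.0
    return out.reshape(*arr.shape, length)

def get_next_batch(arr, batch_size, seq_len):
    # yield (data, label) windows where label is data shifted one character ahead
    n_per_batch = batch_size * seq_len
    n_batches = len(arr) // n_per_batch
    arr = arr[:n_batches * n_per_batch].reshape(batch_size, -1)
    for i in range(0, arr.shape[1] - 1, seq_len):
        data = arr[:, i:i + seq_len]
        label = arr[:, i + 1:i + seq_len + 1]
        if label.shape[1] == seq_len:        # drop the last ragged window
            yield data, label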

Is this expected, or am I doing something wrong here?