I created a very simple network and tried to generate text in the style of the book Anna Karenina (its text can be downloaded here from the Gutenberg.org library).
I used a normal LSTM and it generated text such as:
...
epoch: 59/60 loss: 1.2538
generated text :
the ive been a definiteness
of hearing her husband the merchants
the standard of such as its a picture he said smiling to himself with a firstries of anna went to the shade in
alexey alexandrovitch whate
However, when I tried to use a bidirectional LSTM instead, the loss decreased nearly 10x, but the generated text is awful!
This is a sample using the BiLSTM:
...
epoch: 59/60 loss: 0.1313
generated text:
the a a atat dededededededededededededededededededededededededededededededed dodododododododododododododododododododododododododododododododododododod downwnen nenenenenenenensns
susususeseseseseststststs
This is my network, which as you can see is pretty simple:
import torch
import torch.nn as nn

class lstm_char(nn.Module):
    def __init__(self, rnn_type='rnn', unique_chars=110, hidden_size=30,
                 num_layers=1, dropout=0.3, bidirection=False, act='tanh'):
        super().__init__()
        self.unique_char = unique_chars
        self.int2char = dict(enumerate(self.unique_char))
        self.char2int = {ch: ii for ii, ch in self.int2char.items()}
        self.input_size = len(unique_chars)
        self.output_size = self.input_size
        self.hidden_size = hidden_size
        self.drp = nn.Dropout(dropout)
        self.rnn_type = rnn_type.lower()
        self.direction = 2 if bidirection else 1
        if self.rnn_type == 'rnn':
            self.rnn = nn.RNN(self.input_size, hidden_size, num_layers,
                              batch_first=True, dropout=dropout,
                              bidirectional=bidirection)
        elif self.rnn_type == 'gru':
            self.rnn = nn.GRU(self.input_size, hidden_size, num_layers,
                              batch_first=True, dropout=dropout,
                              bidirectional=bidirection)
        else:
            self.rnn = nn.LSTM(self.input_size, hidden_size, num_layers,
                               batch_first=True, dropout=dropout,
                               bidirectional=bidirection)
        # map each timestep's features back onto the vocabulary
        self.fc = nn.Linear(hidden_size * self.direction, self.output_size)

    def forward(self, input, hidden_states):
        rnn_outputs, hidden_states = self.rnn(input, hidden_states)
        # flatten (batch, seq, features) to (batch*seq, features) for the linear layer
        outputs = rnn_outputs.reshape(-1, self.hidden_size * self.direction)
        outputs = self.drp(outputs)
        outputs = self.fc(outputs)
        return outputs, hidden_states
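As a sanity check, the output shapes look right to me on dummy input (this snippet is just for illustration; the toy vocabulary is made up):

import string

chars = list(string.ascii_lowercase + ' ')   # hypothetical toy vocabulary
m = lstm_char(rnn_type='lstm', unique_chars=chars, hidden_size=30,
              num_layers=1, bidirection=True)
x = torch.zeros(4, 10, len(chars))           # (batch, seq_len, vocab) one-hot input
out, hs = m(x, None)
print(out.shape)                             # torch.Size([40, 27]) -> (batch*seq_len, vocab)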
And this is my training loop:
import torch.optim as optim

hidden_size = 512
layers_cnt = 1
bidirection = True
rnn_type = 'lstm'
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

model = lstm_char(rnn_type=rnn_type,
                  unique_chars=unique_chars,
                  hidden_size=hidden_size,
                  num_layers=layers_cnt,
                  bidirection=bidirection)
model = model.to(device)
optimizer = optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

epochs = 60
clip = 5.
interval = 1000
batch_size = 128
label_length = len(unique_chars)
hidden_states = None
val_ratio = 0.2
val_idx = int(corpus_digitized.size * (1 - val_ratio))
train = corpus_digitized[:val_idx]
val = corpus_digitized[val_idx:]
assert train.size + val.size == corpus_digitized.size, 'they must be equal!'

for e in range(epochs):
    for i, (data, label) in enumerate(get_next_batch(train, batch_size, seq_len=seq_len)):
        model.train()
        data = torch.from_numpy(one_hot(data, length=label_length)).to(device)
        label = torch.from_numpy(label).to(device)
        output, hidden_states = model(data, hidden_states)
        # detach the hidden state so backprop does not reach into previous batches
        if model.rnn_type == 'lstm':
            hidden_states = tuple(h.data for h in hidden_states)
        else:
            hidden_states = hidden_states.data
        # flatten the labels to match the (batch*seq_len, vocab) output
        label = label.view(batch_size * seq_len).long()
        loss = criterion(output, label)
        optimizer.zero_grad()
        loss.backward()
        # the trailing _ marks an in-place operation
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=clip)
        optimizer.step()
        if i % interval == 0:
            print(f'epoch: {e}/{epochs} loss: {loss.item():.4f}')
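In case it's relevant, the one_hot and get_next_batch helpers used above do roughly the following (simplified sketches, not my exact code):

import numpy as np

def one_hot(arr, length):
    # (batch, seq_len) int array -> (batch, seq_len, length) float32 one-hot
    out = np.zeros((arr.size, length), dtype=np.float32)
    out[np.arange(arr.size), arr.flatten()] = 1.
    return out.reshape(*arr.shape, length)

def get_next_batch(arr, batch_size, seq_len):
    # chop the corpus into batch_size parallel streams and yield
    # (input, target) windows where the target is the input shifted by one
    n_windows = arr.size // (batch_size * seq_len)
    arr = arr[:n_windows * batch_size * seq_len].reshape(batch_size, -1)
    for s in range(0, arr.shape[1], seq_len):
        data = arr[:, s:s + seq_len]
        label = np.zeros_like(data)
        label[:, :-1] = data[:, 1:]
        # wrap the final target around so shapes stay (batch, seq_len)
        label[:, -1] = arr[:, (s + seq_len) % arr.shape[1]]
        yield data, label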
Is this expected, or am I doing something wrong here?
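For reference, this is roughly how I generate the samples above (a simplified sketch of my sampling loop; predict_next and sample are just illustrative names):

def predict_next(model, ch, hidden):
    # one forward step: sample the next character from the softmax distribution
    x = np.array([[model.char2int[ch]]])
    x = torch.from_numpy(one_hot(x, length=len(model.unique_char))).to(device)
    out, hidden = model(x, hidden)
    p = torch.softmax(out.squeeze(), dim=0).cpu().numpy()
    p = p / p.sum()  # guard against float32 drift before np.random.choice
    return model.int2char[np.random.choice(len(p), p=p)], hidden

def sample(model, prime='the ', gen_length=200):
    model.eval()
    hidden = None
    chars = list(prime)
    with torch.no_grad():
        # run the prime text through first to warm up the hidden state
        for ch in prime:
            next_ch, hidden = predict_next(model, ch, hidden)
        chars.append(next_ch)
        # then feed each prediction back in as the next input
        for _ in range(gen_length):
            next_ch, hidden = predict_next(model, chars[-1], hidden)
            chars.append(next_ch)
    model.train()
    return ''.join(chars)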