Hi,
I am working on a language model that's trained on text sequences using one-hot encoding. I have an option for setting bidirectional to True, and I got it "working" (meaning the dimensions line up and the program doesn't crash), but there's a big issue: when I run the code with a bidirectional RNN, it trains, but the loss is instantly, implausibly low, around 0.08 after only one epoch. I knew that couldn't be right.
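For context, here's roughly how I batch the data and compute the loss (a simplified sketch, not my exact training loop; one_hot_encode and detach_hidden are helpers sketched at the end of this post):

def get_batches(encoded, batch_size, seq_len):
    ''' Yield (inputs, targets); targets are the inputs shifted one char ahead.
        The last ragged window is dropped so targets never run off the end. '''
    n_per_batch = batch_size * seq_len
    n_batches = len(encoded) // n_per_batch
    encoded = encoded[:n_batches * n_per_batch].reshape((batch_size, -1))
    for i in range(0, encoded.shape[1] - seq_len, seq_len):
        x = encoded[:, i:i + seq_len]
        y = encoded[:, i + 1:i + seq_len + 1]
        yield x, y

# One training step (criterion is nn.CrossEntropyLoss()):
inputs = torch.from_numpy(one_hot_encode(x, len(net.chars)))
targets = torch.from_numpy(y.flatten()).long()
h = detach_hidden(h)
output, h = net(inputs, h)
loss = criterion(output, targets)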
I tried sampling from the model using the prime string "The ", and the model predicts " ththththththththth" and so on. I have no idea what's causing this, but I'm guessing it has something to do with how my data is handled in the bidirectional case, because the program works perfectly fine without bidirectionality.
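The sampling routine looks roughly like this (a simplified sketch, not the exact code; it feeds the prime string through the net one character at a time, then samples from the top-k predictions):

def sample(net, size, prime='The ', top_k=5):
    ''' Prime the network with a string, then sample `size` more characters. '''
    net.eval()
    h = net.init_hidden(1)
    chars = list(prime)
    # Run the prime string through the network one character at a time
    for ch in prime:
        char, h = net.predict(ch, h, top_k=top_k)
    chars.append(char)
    # Sample the rest, feeding each prediction back in as the next input
    for _ in range(size):
        char, h = net.predict(chars[-1], h, top_k=top_k)
        chars.append(char)
    return ''.join(chars)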
Here’s the network:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

class CharRNN(nn.Module):
    def __init__(self, text, rnn_type='LSTM', bidirectional=False, n_hidden=512,
                 n_layers=4, dropout=0.3, lr=2e-3, initrange=1, cuda=False,
                 cudnn_fastest=False, cudnn_benchmark=False):
        super().__init__()
        self.rnn_type = rnn_type.upper()
        self.bidirectional = bidirectional
        self.drop = dropout
        self.n_hidden = n_hidden
        self.n_layers = n_layers
        self.lr = lr
        self.initrange = initrange
        self.use_cuda = cuda
        self.fastest = cudnn_fastest
        self.benchmark = cudnn_benchmark
        if cudnn_fastest:
            torch.backends.cudnn.fastest = True
        if cudnn_benchmark:
            torch.backends.cudnn.benchmark = True
        self.text = text
        # get_lookup_tables is one of my helper functions (sketched at the end of this post)
        self.int2char, self.char2int = get_lookup_tables(text)
        self.chars = tuple(self.char2int.keys())
        self.dropout = nn.Dropout(dropout)
        if self.rnn_type in ('LSTM', 'GRU'):
            self.rnn = getattr(nn, self.rnn_type)(len(self.chars), n_hidden, n_layers,
                                                  dropout=dropout, bidirectional=bidirectional,
                                                  batch_first=True)
        else:
            try:
                nonlin = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[self.rnn_type]
            except KeyError:
                raise ValueError('An invalid option for `--rnntype` was supplied; '
                                 'valid options are `LSTM`, `GRU`, `RNN_TANH`, or `RNN_RELU`')
            # bidirectional is passed here too, so the decoder's input size matches
            self.rnn = nn.RNN(len(self.chars), n_hidden, n_layers, nonlinearity=nonlin,
                              dropout=dropout, bidirectional=bidirectional, batch_first=True)
        # The decoder sees 2*n_hidden features per step when the RNN is bidirectional
        self.decoder = nn.Linear(n_hidden * 2 if bidirectional else n_hidden, len(self.chars))
        self.init_weights()
        if not cuda and torch.cuda.is_available():
            print('WARNING: CUDA argument was set to False. Your device supports CUDA; you should use it.')
        if cuda:
            self.cuda()
        else:
            self.cpu()
    def forward(self, x, hc):
        ''' Forward pass through the network '''
        x, h = self.rnn(x, hc)
        x = self.dropout(x)
        # Flatten (batch, seq_len, features) to (batch*seq_len, features);
        # the RNN outputs 2*n_hidden features per step when bidirectional
        n_out = self.n_hidden * 2 if self.bidirectional else self.n_hidden
        x = x.contiguous().view(x.size(0) * x.size(1), n_out)
        x = self.decoder(x)
        return x, h
    def init_weights(self):
        ''' Initialize weights of the decoder (fully connected layer) '''
        # Zero the decoder bias
        self.decoder.bias.data.fill_(0)
        # Initialize decoder weights from a uniform distribution
        self.decoder.weight.data.uniform_(-self.initrange, self.initrange)
    def init_hidden(self, batch_size):
        ''' Initialize the hidden state of the RNN
        arguments:
            batch_size: batch size
        returns:
            a zero tensor of shape (n_layers * num_directions, batch_size, n_hidden);
            if the rnn type is an LSTM, a tuple of two such tensors (hidden and cell state)
        '''
        # Create new tensors of shape n_layers (x2 if bidirectional) x batch_size x n_hidden,
        # initialized to zero, for the hidden state (and cell state) of the RNN
        weight = next(self.parameters()).data
        n_dir_layers = self.n_layers * 2 if self.bidirectional else self.n_layers
        if self.rnn_type == 'LSTM':
            return (weight.new_zeros(n_dir_layers, batch_size, self.n_hidden),
                    weight.new_zeros(n_dir_layers, batch_size, self.n_hidden))
        else:
            return weight.new_zeros(n_dir_layers, batch_size, self.n_hidden)
    def predict(self, char, h=None, top_k=None):
        ''' Predict the character following the given character
        arguments:
            char: starting character
            h: hidden state
            top_k: sample only from the k most probable characters in the output softmax
        returns:
            predicted character
            h: hidden state
        '''
        if h is None:  # If no hidden state is supplied, initialize one for a batch size of 1
            h = self.init_hidden(1)
        x = np.array([[self.char2int[char]]])
        x = one_hot_encode(x, len(self.chars))
        with torch.no_grad():
            inputs = torch.from_numpy(x)
            if self.use_cuda:
                inputs = inputs.cuda()
            h = detach_hidden(h)
            out, h = self.forward(inputs, h)
            p = F.softmax(out, dim=1)
        if self.use_cuda:
            p = p.cpu()
        if not top_k:
            top_ch = np.arange(len(self.chars))
        else:
            p, top_ch = p.topk(top_k)
            top_ch = top_ch.numpy().squeeze()
        p = p.numpy().squeeze()
        char = np.random.choice(top_ch, p=p / p.sum())
        return self.int2char[char], h
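For completeness, the helper functions the class relies on look roughly like this (simplified sketches, not my exact code):

def get_lookup_tables(text):
    ''' Build int->char and char->int lookup tables from the text. '''
    chars = tuple(sorted(set(text)))
    int2char = dict(enumerate(chars))
    char2int = {ch: i for i, ch in int2char.items()}
    return int2char, char2int

def one_hot_encode(arr, n_labels):
    ''' One-hot encode an integer array to shape (*arr.shape, n_labels). '''
    one_hot = np.zeros((arr.size, n_labels), dtype=np.float32)
    one_hot[np.arange(arr.size), arr.flatten()] = 1.0
    return one_hot.reshape((*arr.shape, n_labels))

def detach_hidden(h):
    ''' Detach the hidden state from its history so gradients don't flow back. '''
    if isinstance(h, torch.Tensor):
        return h.detach()
    return tuple(detach_hidden(v) for v in h)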
If someone could help, that would be great. Thanks!