I cannot for the life of me figure out what I am doing wrong.
I am trying to create a bidirectional LSTM in order to do sentence classification, but I cannot manage to get it to learn. I don't know if I am doing something wrong with the batching of the input.
I have built a model like the following:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable

class RNNModel(nn.Module):
    def __init__(self, vocab_size, weights):
        super(RNNModel, self).__init__()
        self.hidden_dimensions = 5
        self.drop = nn.Dropout(0.5)
        # one extra embedding row reserved for the padding index
        self.embeddings = nn.Embedding(vocab_size + 1, 300, padding_idx=vocab_size)
        self.embeddings.weight.data.copy_(torch.from_numpy(weights))
        self.rnn = nn.LSTM(300, self.hidden_dimensions, 5,
                           bidirectional=True, dropout=0.5, batch_first=True)
        self.hidden2label = nn.Linear(self.hidden_dimensions * 2, 2)

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        # 5 layers * 2 directions = 10
        return (Variable(weight.new(10, bsz, self.hidden_dimensions).zero_()),
                Variable(weight.new(10, bsz, self.hidden_dimensions).zero_()))

    def forward(self, sentence, hidden):
        x = self.embeddings(sentence)
        x = self.drop(x)
        output, hidden = self.rnn(x, hidden)
        output = self.drop(output)
        # last time step of the (batch_first) output feeds the classifier
        y = self.hidden2label(output[:, -1, :])
        log_probs = F.log_softmax(y, dim=1)
        return log_probs, hidden
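As a rough sanity check (throwaway names, random weights instead of GloVe), the forward pass at least gives me the shapes I expect:
import numpy as np

fake_vocab_size = 100
fake_weights = np.random.randn(fake_vocab_size + 1, 300).astype("float32")
m = RNNModel(fake_vocab_size, fake_weights)

batch = Variable(torch.LongTensor(2, 7).random_(0, fake_vocab_size))  # 2 sentences, 7 tokens each
hidden = m.init_hidden(2)
log_probs, hidden = m(batch, hidden)
print(log_probs.size())  # torch.Size([2, 2])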
In order to train my network, I load the sentences into an array and load the GloVe embeddings via gensim:
from gensim.models import KeyedVectors
import numpy as np

wv = KeyedVectors.load_word2vec_format('data/glove-w2v.6B.100d.txt', binary=False)
weights = wv.syn0
# append an all-zero row to use as the padding vector
weights = np.append(weights, [
    np.zeros(300).astype("float32")
], axis=0)
vocab_size = len(wv.vocab)
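This is just the quick shape check I do afterwards, to confirm that the weight matrix lines up with the embedding layer (one extra row for the padding index):
print(weights.shape)  # I expect (vocab_size + 1, 300), matching nn.Embedding(vocab_size + 1, 300)
print(vocab_size)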
I instantiate my model:
model = RNNModel(vocab_size, weights).cuda()  # on the GPU, since the batches below are .cuda()
and I try to train it like this (a simplified version based on the seq2seq example):
import spacy

nlp = spacy.load('en')

def embed_text(data, max_tokens=None):
    sentences = []
    biggest_sentence = 0
    for sentence in data:
        indexes = []
        tokens_added = 0
        doc = nlp(sentence)
        words_in_sentence = len(doc)
        if max_tokens is not None and words_in_sentence > max_tokens:
            words_in_sentence = max_tokens
        if words_in_sentence > biggest_sentence:
            biggest_sentence = words_in_sentence
        for token in doc:
            word = token.text.lower()
            if word in wv.vocab:
                indexes.append(wv.vocab[word].index)
            else:
                pass  # ignore unknown words for now
            tokens_added += 1
            if max_tokens is not None and tokens_added == max_tokens:
                break
        sentences.append(indexes)
    for s in sentences:
        while len(s) < biggest_sentence:
            # pad with the padding index (whose embedding row is all zeros)
            s.append(vocab_size)
    return sentences  # batch_size x max_sentence_length_in_batch
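For example (throwaway sentences, not my real data), the returned rows are padded to the longest sentence in the batch:
example = embed_text(["the cat sat on the mat", "hello"])
print([len(row) for row in example])  # e.g. [6, 6] -- both rows padded to the same length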
def repackage_hidden(h):
    if isinstance(h, Variable):
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)
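The way I understand it, this keeps the hidden state shapes but detaches the autograd history, e.g.:
h = repackage_hidden(model.init_hidden(4))  # 4 is an arbitrary batch size
print(h[0].size())  # torch.Size([10, 4, 5]) -> (num_layers * 2, bsz, hidden_dimensions)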
epochs = 5
batch_size = 20

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

for epoch in range(1, epochs + 1):
    model.train()
    hidden = model.init_hidden(batch_size)
    for i in range(0, len(x), batch_size):
        batch_range = min(batch_size, len(x) - i)
        batch_x = embed_text(x[i:i + batch_range])
        batch_x = Variable(torch.LongTensor(batch_x)).cuda()
        batch_y = Variable(torch.LongTensor(y[i:i + batch_range])).cuda()
        hidden = repackage_hidden(hidden)
        model.zero_grad()
        output, hidden = model(batch_x, hidden)
        loss = criterion(output, batch_y)
        loss.backward()
        torch.nn.utils.clip_grad_norm(model.parameters(), clipping)
        optimizer.step()
        print('loss {:5.2f}'.format(loss.data[0]))
Can someone give me any hints on what I am doing wrong?