Sentence classification with RNN - Batching

I cannot for the life of me figure out what I am doing wrong.

I am trying to build a bi-directional LSTM for sentence classification, but I cannot get it to learn. I suspect I am doing something wrong with the batching of the input.

I have built a model like the following:

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable


class RNNModel(nn.Module):
    def __init__(self, vocab_size, weights):
        super(RNNModel, self).__init__()

        self.drop = nn.Dropout(0.5)

        # one extra row for the padding vector, which lives at index vocab_size
        self.embeddings = nn.Embedding(vocab_size + 1, 300, padding_idx=vocab_size)
        self.embeddings.weight.data.copy_(torch.from_numpy(weights))

        self.hidden_dimensions = 5
        self.nlayers = 5
        self.rnn = nn.LSTM(300, self.hidden_dimensions, self.nlayers,
                           bidirectional=True, dropout=0.5, batch_first=True)

        # bidirectional, so the LSTM output is 2 * hidden_dimensions wide
        self.hidden2label = nn.Linear(self.hidden_dimensions * 2, 2)

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data

        # shape: (num_layers * num_directions, batch, hidden_size)
        hidden = (Variable(weight.new(self.nlayers * 2, bsz, self.hidden_dimensions).zero_()),
                  Variable(weight.new(self.nlayers * 2, bsz, self.hidden_dimensions).zero_()))

        return hidden

    def forward(self, sentence, hidden):
        x = self.embeddings(sentence)
        x = self.drop(x)

        output, hidden = self.rnn(x, hidden)
        output = self.drop(output)

        # classify from the output at the last time step
        y = self.hidden2label(output[:, -1, :])
        log_probs = F.log_softmax(y)

        return log_probs, hidden
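To make the tensor shapes concrete, here is a rough sanity check of the model on its own (illustrative only: the tiny fake vocabulary, fake weight matrix and dummy batch below are made up, not part of my real pipeline):

import numpy as np

# throwaway instance with a made-up vocabulary, just to check shapes
fake_vocab_size = 10
fake_weights = np.zeros((fake_vocab_size + 1, 300), dtype="float32")

m = RNNModel(fake_vocab_size, fake_weights)
h = m.init_hidden(4)

# a batch of 4 "sentences" of 7 tokens each, all set to the padding index
dummy_batch = Variable(torch.LongTensor(4, 7).fill_(fake_vocab_size))

log_probs, h = m(dummy_batch, h)
print(log_probs.size())  # I expect (4, 2): one log-probability pair per sentence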

To train the network, I load the sentences into an array and load the GloVe embeddings via gensim:

 from gensim.models import KeyedVectors
 import numpy as np

 wv = KeyedVectors.load_word2vec_format('data/glove-w2v.6B.100d.txt', binary=False)
 weights = wv.syn0

 # append an all-zero row to serve as the padding vector (index == vocab_size)
 weights = np.append(weights, [
       np.zeros(weights.shape[1]).astype("float32")
 ], axis=0)

 vocab_size = len(wv.vocab)
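Just to spell out what I am assuming about that matrix (a small illustrative check, not part of the training code): the appended zero row is the padding vector, so the matrix should have vocab_size + 1 rows, and its width should match the embedding size the model uses.

 # illustrative check of my assumptions about the embedding matrix
 print(weights.shape)   # I expect (vocab_size + 1, embedding_dim)
 assert weights.shape[0] == vocab_size + 1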

I instantiate my model:

 # the batches below are moved to the GPU, so the model needs to live there too
 model = RNNModel(vocab_size, weights).cuda()

and I try to train it like this (a simplified version based on the seq2seq example):

 import spacy

 nlp = spacy.load('en')

 def embed_text(data, max_tokens=None):
    sentences = []
    biggest_sentence = 0

    for sentence in data:
        indexes = []
        tokens_added = 0

        doc = nlp(sentence)
        words_in_sentence = len(doc)

        if max_tokens is not None and words_in_sentence > max_tokens:
            words_in_sentence = max_tokens

        if words_in_sentence > biggest_sentence:
            biggest_sentence = words_in_sentence

        for token in doc:
            word = token.text.lower()

            if word in wv.vocab:
                indexes.append(wv.vocab[word].index)
            else:
                pass # ignore unknown for now

            tokens_added += 1

            if max_tokens is not None and tokens_added == max_tokens:
                break

        sentences.append(indexes)

    for s in sentences:
        while len(s) < biggest_sentence:
            # Zero padding
            s.append(vocab_size)

    return sentences #  batch_size x max_sentence_length_in_batch

 def repackage_hidden(h):
    if type(h) == Variable:
        return Variable(h.data)
    else:
        return tuple(repackage_hidden(v) for v in h)

 epochs = 5
 batch_size = 20

 for epoch in range(1, epochs + 1):
      model.train()

      # initialise the hidden state once per epoch and carry it across batches
      hidden = model.init_hidden(batch_size)
      optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

      for i in range(0, len(x), batch_size):
            batch_range = min(batch_size, len(x) - i)

            batch_x = embed_text(x[i:i + batch_range])
            batch_x = Variable(torch.LongTensor(batch_x)).cuda()
            batch_y = Variable(torch.LongTensor(y[i:i + batch_range])).cuda()

            hidden = repackage_hidden(hidden)
            model.zero_grad()

            output, hidden = model(batch_x, hidden)

            loss = criterion(output, batch_y)
            loss.backward()

            torch.nn.utils.clip_grad_norm(model.parameters(), clipping)

            for p in model.parameters():
                p.data.add_(-learning_rate, p.grad.data)

            print('loss {:5.2f}'.format(loss.data[0]))

      optimizer.step()
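For reference, this is roughly how I convince myself that the batching helper pads the way I expect (the two sentences below are made up, purely illustrative):

 # illustrative only: two made-up sentences of different lengths
 toy_batch = embed_text(["the cat sat on the mat", "hello world"])

 print([len(s) for s in toy_batch])  # both padded to the longest sentence in the batch
 print(toy_batch[1])                 # word indexes first, then vocab_size as padding at the end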

Can someone give me any hints on what I am doing wrong?

It’s been a while now and I still haven’t been able to figure out what’s wrong. Does anyone have a working example I could look into that uses RNNs at the word level with word embeddings (GloVe, word2vec, etc.)?
