Augment Char Embeddings with Word Embeddings in NER Tagger

I am trying to augment char embeddings with word embeddings to enhance my NER model. The problem is that the code is extremely slow (locally) and crashes Colab every time. Any ideas on how to improve it performance-wise?

Here is my code:

import torch
from torch import autograd, nn
from torch.nn.utils import clip_grad_norm_
from torch.utils.data import Dataset
from tqdm import tqdm


class DualTagger(nn.Module):
    def __init__(self, hparams):
        super(DualTagger, self).__init__()
        self._device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.word_embedding = nn.Embedding(hparams.vocab_size, hparams.embedding_dim)

        self.char_embedding = nn.Embedding(hparams.char_vocab_size, hparams.char_embedding_dim)

        self.char_lstm = nn.LSTM(hparams.char_embedding_dim, hparams.char_hidden_dim)

        self.lstm = nn.LSTM(hparams.embedding_dim + hparams.char_hidden_dim,
                            hparams.hidden_dim)

        # self.dropout = nn.Dropout(hparams.dropout)
        self.hidden2tag = nn.Linear(hparams.hidden_dim, hparams.num_classes)

        self.hidden_char = (autograd.Variable(torch.zeros(1, 1, hparams.char_hidden_dim)).to(self._device),
                            autograd.Variable(torch.zeros(1, 1, hparams.char_hidden_dim)).to(self._device))
        self.hidden_words = (autograd.Variable(torch.zeros(1, 1, hparams.hidden_dim)).to(self._device),
                             autograd.Variable(torch.zeros(1, 1, hparams.hidden_dim)).to(self._device))

    def forward(self, sentence, word_tensors_):
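        # sentence: tensor of word indices for a single sentence (padding stripped by the Trainer)
        # word_tensors_: dict mapping each word index to a tensor of its character indices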
        logits_ = None
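        # process the sentence one word at a time: run the char LSTM over the word's
        # characters, then feed [word embedding ; last char-LSTM output] to the word-level LSTM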
        for idx, word_idx in enumerate(sentence):
            word_chars_tensor = word_tensors_.get(int(word_idx))
            char_embeds = self.char_embedding(word_chars_tensor)
            lstm_char_out, self.hidden_char = self.char_lstm(char_embeds.view(len(word_chars_tensor), 1, -1), self.hidden_char)
            word_embed = self.word_embedding(word_idx)
            embeds_cat = torch.cat((word_embed.view(1, 1, -1), lstm_char_out[-1].view(1, 1, -1)), dim=2)
            lstm_out, self.hidden_words = self.lstm(embeds_cat, self.hidden_words)
            logits = self.hidden2tag(lstm_out.view(1, -1))
            if idx == 0:
                logits_ = logits
            else:
                logits_ = torch.cat((logits_, logits), dim=0)
        return logits_


class HyperParameters():
    def __init__(self, model_name_, vocab, char_vocab, label_vocab, embeddings_, batch_size_):
        self.model_name = model_name_

        self.char_vocab_size, self.vocab_size = len(char_vocab), len(vocab)
        self.hidden_dim, self.char_hidden_dim = 256, 256
        self.embedding_dim, self.char_embedding_dim = 300, 300

        self.num_classes = len(label_vocab)
        self.bidirectional = False
        self.num_layers = 1
        self.dropout = 0.4
        self.embeddings = embeddings_
        self.batch_size = batch_size_


class Trainer(object):
    def __init__(self, model, loss_function, optimizer, verbose):
        self.model = model
        self.loss_function = loss_function
        self.optimizer = optimizer
        self._verbose = verbose

    def train(self, train_dataset: Dataset, valid_dataset: Dataset, epochs: int = 1, word_tensors=None):
        train_loss = 0.0
        for epoch in tqdm(range(epochs), desc="Training Epochs"):
            epoch_loss = 0.0
            self.model.train()
            for step, sample in tqdm(enumerate(train_dataset), desc='Training Batches'):
                inputs = sample['inputs']
                labels = sample['outputs']

                self.optimizer.zero_grad()

                for input_, tags in zip(tqdm(inputs), labels):
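                    # each sentence is stripped of its padding and run through the model
                    # individually; backward() runs per sentence, the optimizer steps per batch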
                    input_ = input_[input_.nonzero()]
                    predictions = self.model(input_, word_tensors)
                    predictions = predictions.view(-1, predictions.shape[-1])
                    tags = tags[tags.nonzero()].view(-1)

                    sample_loss = self.loss_function(predictions, tags)
                    sample_loss.backward(retain_graph=True)
                    train_loss += sample_loss.item()

                clip_grad_norm_(self.model.parameters(), 5.)  # Gradient Clipping
                self.optimizer.step()

                epoch_loss += sample_loss.tolist()

            avg_epoch_loss = epoch_loss / len(train_dataset)
            train_loss += avg_epoch_loss

            valid_loss = self.evaluate(valid_dataset, word_tensors)

            if self._verbose > 0:
                print(f'Epoch {epoch}: [loss = {avg_epoch_loss:0.4f},  val_loss = {valid_loss:0.4f}]')
        avg_epoch_loss = train_loss / epochs
        return avg_epoch_loss

    def evaluate(self, valid_dataset, word_tensors):
        valid_loss = 0.0
        self.model.eval()
        with torch.no_grad():
            for sample in valid_dataset:
                inputs = sample['inputs']
                labels = sample['outputs']
                for input_, tags in zip(tqdm(inputs), labels):
                    input_ = input_[input_.nonzero()]
                    predictions = self.model(input_, word_tensors)
                    predictions = predictions.view(-1, predictions.shape[-1])
                    tags = tags[tags.nonzero()].view(-1)
                    sample_loss = self.loss_function(predictions, tags)
                    valid_loss += sample_loss.tolist()
        return valid_loss / len(valid_dataset)

    def predict(self, x, word_tensors):
        self.model.eval()
        with torch.no_grad():
            logits = self.model(x, word_tensors)
            predictions = torch.argmax(logits, -1)
            return logits, predictions

How does this code crash Colab? Do you get any error message?

I don’t know; it just tells me that my session crashed for an unknown reason. When I check the runtime log, I find these two messages:

INFO|Adapting to protocol v5.1 for kernel acc9b289-66f6-4f94-ab15-df41913d1778|

WARNING|warn(“IPython.utils.traitlets has moved to a top-level traitlets package.”)|

In my experience, Jupyter notebooks don’t give you the complete stack trace if the kernel crashes.
Could you run the script locally in a terminal? That should give you the proper error.

It does not give any errors; it runs, but it takes about 30 minutes per data sample (sentence), and I have 100K sentences, so it is very slow.
I am using an Nvidia GTX 950, CUDA 10.1, and PyTorch 1.4.0.

Could you post the necessary input shapes and all arguments so that we could run your script using random data?

Of course.

Data (a random-data harness with these shapes is sketched at the end of this post):
input: list of lists of strings (tokens), [Batch Size, Max Seq Len] => [128, 173]
ex: [["This", "is", "the", "first", "homework"], ["Barack", "Obama", "was", "elected"]]
output: the same structure of strings (labels), [Batch Size, Max Seq Len] => [128, 173]
ex: [["O", "O", "O", "O", "O"], ["PER", "PER", "O", "O"]]
Hyperparams:

loss_function = nn.CrossEntropyLoss(ignore_index=0)
optimizer = optim.Adam(model.parameters())  # default args
char_vocab_size, vocab_size = len(char_vocab), len(vocab) # chars_vocab 767, words_vocab 91584
hidden_dim, char_hidden_dim = 256, 256
embedding_dim, char_embedding_dim = 300, 300
num_classes = len(label_vocab) # 5 including "<PAD>" token
bidirectional = False
num_layers = 1
dropout = 0.4
embeddings = pretrained_embeddings # None
batch_size = 128
epochs = 10

Other libraries used: tqdm (progress bars)
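
If it helps, here is a rough harness that builds random data in these shapes and pushes one sentence through the model, assuming the classes above are already defined. It is only a sketch, not my real preprocessing: FakeVocab, the 15-character cap per word, and the single-sentence smoke test are stand-ins to keep it self-contained.

import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'
batch_size, max_seq_len = 128, 173
vocab_size, char_vocab_size, num_classes = 91584, 767, 5

# 0 is reserved for <PAD>, so random indices start at 1
inputs = torch.randint(1, vocab_size, (batch_size, max_seq_len)).to(device)
labels = torch.randint(1, num_classes, (batch_size, max_seq_len)).to(device)

# one variable-length char-index tensor per word index, as forward() expects
# (building the full 91K-entry lookup takes a little while)
word_tensors = {w: torch.randint(1, char_vocab_size, (torch.randint(1, 15, (1,)).item(),)).to(device)
                for w in range(vocab_size)}

class FakeVocab:  # HyperParameters only calls len() on the vocab objects
    def __init__(self, n): self.n = n
    def __len__(self): return self.n

hparams = HyperParameters('dual_tagger', FakeVocab(vocab_size), FakeVocab(char_vocab_size),
                          FakeVocab(num_classes), None, batch_size)
model = DualTagger(hparams).to(device)

# smoke test: one padded sentence through forward()
sentence = inputs[0]
logits = model(sentence[sentence.nonzero()], word_tensors)
print(logits.shape)  # torch.Size([173, 5])

# for the Trainer, a fake dataset is just a list like [{'inputs': inputs, 'outputs': labels}]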