Hello, I tried to complete the exercise on the LSTM POS tagger: I implemented the char-level features with a second LSTM and fed its output into the main one by concatenating it to the original word embedding.
The code runs and trains (the main LSTM takes the concatenated word+char embedding as input), but there is no backprop on the char_LSTM side. I verified this by printing some of its weights during the epochs; they remained constant.
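A minimal version of that check looks roughly like this (model, and the other training-loop names, are as in the sketch at the end of the post):

# Snapshot a char_LSTM weight before training, train for a few epochs, then compare.
w_before = model.char_LSTM_embedding.lstm.weight_ih_l0.clone()
# ... run the training loop ...
w_after = model.char_LSTM_embedding.lstm.weight_ih_l0
print(torch.equal(w_before, w_after))                    # True, i.e. the char_LSTM weights never moved
print(model.char_LSTM_embedding.lstm.weight_ih_l0.grad)  # another way to check whether any gradient reaches the char_LSTM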
@smth @ptrblck There isn't a clean implementation of this out there to refer to; I was hoping this could be it.
Any ideas? Thanks!
Code:
import string
from random import shuffle
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
torch.manual_seed(1)
def prepare_sequence(seq, to_ix):
    idxs = [to_ix[w] for w in seq]
    return torch.LongTensor(idxs)
training_data = [
("The dog ate the apple".split(), ["DET", "NN", "V", "DET", "NN"]),
("Everybody read that book".split(), ["NN", "V", "DET", "NN"])
]
# Dictionaries
word_to_ix = {}
tag_to_ix = {}
char_to_ix = {}
# Constants
CHAR_EMBEDDING_DIM = 6
CHAR_HIDDEN_DIM = 4
EMBEDDING_DIM = 6
HIDDEN_DIM = 6
# Computing Word & Tag Dictionaries
for sent, tags in training_data:
    for word in sent:
        if word not in word_to_ix:
            word_to_ix[word] = len(word_to_ix)
    for tag in tags:
        if tag not in tag_to_ix:
            tag_to_ix[tag] = len(tag_to_ix)
# Computing the character dictionary from the English alphabet (case sensitive)
allchars = [i for i in string.ascii_lowercase + string.ascii_uppercase]
shuffle(allchars)  # To not rely on any inherent ordering
for char in allchars:
    if char not in char_to_ix:
        char_to_ix[char] = len(char_to_ix)
class char_LSTM(nn.Module):
    '''El Chapo: character-level LSTM that encodes a word's characters into a dense vector'''

    def __init__(self, char_embedding_dim, char_hidden_dim, charset_size):
        super(char_LSTM, self).__init__()
        self.char_hidden_dim = char_hidden_dim
        self.char_embedding = nn.Embedding(charset_size, char_embedding_dim)
        self.lstm = nn.LSTM(char_embedding_dim, char_hidden_dim)
        self.char_hidden = self.init_hidden()

    def init_hidden(self):
        '''Initialize the hidden state'''
        return (torch.rand(1, 1, self.char_hidden_dim),
                torch.rand(1, 1, self.char_hidden_dim))

    def forward(self, single_word):
        '''Return the final hidden state, a.k.a. the char embedding (this encodes dense character features)'''
        char_embeds = self.char_embedding(single_word)
        _, self.char_hidden = self.lstm(char_embeds.view(len(single_word), 1, -1), self.char_hidden)
        self.char_hidden = self.init_hidden()
        return self.char_hidden[0]
class LSTMTagger(nn.Module):
    '''GodFather: word-level tagger that consumes the concatenated word + char embeddings'''

    def __init__(self, embedding_dim, hidden_dim, char_embedding_dim, char_hidden_dim, vocab_size, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim
        self.word_embeddings = nn.Embedding(vocab_size, embedding_dim)
        self.char_LSTM_embedding = char_LSTM(char_embedding_dim, char_hidden_dim, len(char_to_ix))
        # Note: LSTM input size is embedding_dim + char_hidden_dim to play nicely with concatenation
        self.lstm = nn.LSTM(embedding_dim + char_hidden_dim, hidden_dim)
        self.hidden2tag = nn.Linear(hidden_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        '''Initialize the hidden state'''
        return (torch.rand(1, 1, self.hidden_dim),
                torch.randn(1, 1, self.hidden_dim))

    def concat_embeddings(self, some_word_embedding_tensor, some_character_embedding_tensor):
        '''Concatenate the word embedding and character embedding into a single tensor. Do this for all words'''
        combo = []
        for w, c in zip(some_word_embedding_tensor, some_character_embedding_tensor):
            combo.append(torch.cat((w, c)))
        return torch.stack(combo)

    def forward(self, sentence, sentence_chars):
        word_embeds = self.word_embeddings(sentence)
        char_embeds = []
        for single_word_char in sentence_chars:
            # Iterate through each word and append its character embedding to char_embeds
            char_embeds.append(torch.squeeze(self.char_LSTM_embedding(single_word_char)))
        # Concatenate the word embedding with the char embedding (i.e. the hidden state from the char_LSTM for each word)
        word_char_embeds = self.concat_embeddings(word_embeds, char_embeds)
        lstm_out, self.hidden = self.lstm(word_char_embeds.view(len(sentence), 1, -1), self.hidden)
        tag_space = self.hidden2tag(lstm_out.view(len(sentence), -1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores