Hi,
Thank you for your answer.
Following your suggestion, here are parts of the code to make things clearer.
So, once I have read the data, I create batches with my own function create_batches:
batch_training_data = create_batches(nn_params, training_data, word_to_ix, tag_to_ix, char_to_ix)
batch_dev_data = create_batches(nn_params, dev_data, word_to_ix, tag_to_ix, char_to_ix)
batch_training_data and batch_dev_data are lists.
word_to_ix, tag_to_ix and char_to_ix are maps from string to indices, just like those in the tutorials in the pytorch website.
Each element of the list is a tuple containing 3 LongTensors:
in_batch = torch.LongTensor(nn_params.max_sentence_length, nn_params.batch_size)
out_batch = torch.LongTensor(nn_params.max_sentence_length, nn_params.batch_size)
char_batch = torch.LongTensor(nn_params.max_sentence_length, nn_params.max_token_length, nn_params.batch_size)
in_batch contains the tokens, out_batch the labels, and char_batch the characters of each token.
With the data I’m using, nn_params.max_sentence_length is 96, nn_params.batch_size is 25 and nn_params.max_token_length is 17; these are the dimensions I mentioned in my first post.
Now, the model is made of:
self.word_embeddings = nn.Embedding(self.vocab_size, nn_params.embedding_dim)
self.char_embeddings = nn.Embedding(self.char_vocab_size, nn_params.char_embed_dim)
self.label_embeddings = nn.Embedding(self.tagset_size, nn_params.label_embed_dim)
self.embed_dropout = nn.Dropout(p=0.5)
self.charRNN = nn.LSTM(nn_params.char_embed_dim, nn_params.char_hidden_dim, bidirectional=nn_params.bilstm) # character-level LSTM
rnn_input_size = nn_params.embedding_dim + nn_params.label_embed_dim + self.num_directions * nn_params.char_hidden_dim
self.RNN = nn.LSTM(rnn_input_size, nn_params.hidden_dim, bidirectional=nn_params.bilstm) # global model hidden layer
self.hidden_dropout = nn.Dropout(p=0.5)
self.hidden2tag = nn.Linear(self.num_directions * nn_params.hidden_dim, self.tagset_size) # mapping from hidden to tag space
self.aux_hidden2tag = nn.Linear(nn_params.hidden_dim, self.tagset_size) # auxiliary mapping for forward and backward steps
# Main hidden layer state (for self.RNN)
self.hidden = (autograd.Variable(torch.zeros(self.num_directions, self.batch_size, self.hidden_dim).type(dtype), requires_grad=False, volatile=(self.TEST==1)),
autograd.Variable(torch.zeros(self.num_directions, self.batch_size, self.hidden_dim).type(dtype), requires_grad=False, volatile=(self.TEST==1)))
# Character-level hidden layer state (for self.charRNN)
self.char_hidden = (autograd.Variable(torch.zeros(self.num_directions, self.batch_size, self.char_hidden_dim).type(dtype), requires_grad=False, volatile=(self.TEST==1)),
autograd.Variable(torch.zeros(self.num_directions, self.batch_size, self.char_hidden_dim).type(dtype), requires_grad=False, volatile=(self.TEST==1)))
Basically, since I have to re-inject previously predicted labels as input to the model, I run the forward and backward steps of the LSTM “by hand”. I know this is not the best choice, as it will be slower, and I also do it by running a bidirectional LSTM twice, but I want to try this to compare with my previous models coded in Octave, which were running faster and giving better results (so far…).
In the following:
- dtype is torch.FloatTensor or torch.cuda.FloatTensor, depending on whether CUDA is available.
- num_directions depends on a flag telling whether the LSTMs are bidirectional or not.
- There are a couple of intermediate variables needed to store character-level representations (char_rep), bidirectional hidden states (hidden_state), forward pass output (fw_scores), backward pass output (bw_scores), and bidirectional output (output)
The forward function of the model looks like:
vflag = (self.TEST==1)
(sequence_length, token_length, batch_size) = char_sequence.size()
# 1. Character-level LSTM for computing character-level representations, which are saved in _char_rep_
char_rep = autograd.Variable(torch.zeros(sequence_length, batch_size, self.num_directions * self.char_hidden_dim).type(dtype), volatile=vflag)
for i in range(sequence_length):
self.char_hidden = (autograd.Variable(torch.zeros(self.num_directions, self.batch_size, self.char_hidden_dim).type(dtype), requires_grad=False, volatile=vflag),
autograd.Variable(torch.zeros(self.num_directions, self.batch_size, self.char_hidden_dim).type(dtype), requires_grad=False, volatile=vflag))
char_embeds = self.char_embeddings( char_sequence[i,:,:] )
lstm_out, self.char_hidden = self.charRNN(char_embeds, self.char_hidden)
char_rep[i, :, 0:self.char_hidden_dim] = self.char_hidden[0][0, :, :]
char_rep[i, :, self.char_hidden_dim:2*self.char_hidden_dim] = self.char_hidden[0][1, :, :]
word_embeds = self.embed_dropout( self.word_embeddings(sentence) )
old_hidden = self.hidden # save hidden state to reset it after forward step
# 2. Backward pass
bw_scores = autograd.Variable(torch.zeros(sentence_length, batch_size, self.tagset_size).type(dtype), requires_grad=False, volatile=vflag)
hidden_state = autograd.Variable(torch.zeros(sentence_length, batch_size, self.num_directions * self.hidden_dim).type(dtype), requires_grad=False, volatile=vflag)
prev_labels = labels[-1,:]
for i in range(sentence_length-1,-1,-1):
label_embeds = self.embed_dropout( self.label_embeddings(prev_labels) )
total_input = torch.cat( [word_embeds[i,:,:].view(1, batch_size, -1), char_rep[i,:,:].view(1, batch_size, -1), label_embeds.view(1, batch_size, -1)], 2 )
lstm_out, self.hidden = self.RNN(total_input, self.hidden)
hidden_state[i,:,self.hidden_dim:2*self.hidden_dim] = lstm_out[:,:,self.hidden_dim:2*self.hidden_dim]
bw_output = F.log_softmax( self.aux_hidden2tag( self.hidden_dropout(hidden_state[i,:,self.hidden_dim:2*self.hidden_dim]) ) )
(max_scores, max_indeces) = torch.max(bw_output, 1)
bw_scores[i,:,:] = bw_output
prev_labels = max_indeces
# 3. Forward pass
fw_scores = autograd.Variable(torch.zeros(sentence_length, batch_size, self.tagset_size).type(dtype), requires_grad=False, volatile=vflag)
# We re-initialize the hidden state at its original value to compare equal to the bi-lstm performed in one shot by the PyTorch LSTM components
self.hidden = old_hidden
prev_labels = labels[0,:]
for i in range(len(sentence)):
label_embeds = self.embed_dropout( self.label_embeddings(prev_labels) )
total_input = torch.cat( [word_embeds[i,:,:].view(1, batch_size, -1), char_rep[i,:,:].view(1, batch_size, -1), label_embeds.view(1, batch_size, -1)], 2 )
lstm_out, self.hidden = self.RNN(total_input, self.hidden)
hidden_state[i,:,0:self.hidden_dim] = lstm_out[:,:,0:self.hidden_dim]
fw_output = F.log_softmax( self.aux_hidden2tag( self.hidden_dropout(hidden_state[i,:,0:self.hidden_dim]) ) )
(max_scores, max_indeces) = torch.max(fw_output, 1)
fw_scores[i,:,:] = fw_output
prev_labels = max_indeces
# 4. Computes bidirectional label predictions
output = autograd.Variable(torch.zeros(sentence_length, batch_size, self.tagset_size).type(dtype), volatile=vflag)
for i in range(len(sentence)):
output[i,:,:] = F.log_softmax( self.hidden2tag( self.hidden_dropout(hidden_state[i,:,:].view(batch_size,-1)) ) )
return output
I use 2 optimisers, one for the whole model and one for the auxiliary model, used to compute forward and backward predictions only:
optimizer = optim.Adadelta( model.parameters() )
aux_optimizer = optim.Adadelta( model.aux_hidden2tag.parameters() )
I know also this is not the best choice as the auxiliary optimiser is computing many gradients that will not be used, however…
The main train loop looks like this:
for epoch in range(args.epochs): # args.epochs is the number of epochs passed in from command line
train_loss = 0
model.TEST = 0 # Flag used to know if we are in training or testing mode, so that to make variables volatile
for i in torch.randperm( len(batch_training_data) ): # looping over shuffled training batches
model.init_hidden() # re-init the hidden state for each batch
nn_input = prepare_batch(batch_training_data[i][0], model.CUDA, model.TEST)
nn_output = prepare_batch(batch_training_data[i][1], model.CUDA, model.TEST)
input = [nn_input]
if nn_params.char_features: # flag telling if we use character features
input.append( prepare_batch(batch_training_data[i][2], model.CUDA, model.TEST) )
if nn_params.label_features: # flag telling if we use label features
input.append( nn_output )
output = model(input) # output contains bidirectional, forward and backward scores
tag_scores = output[0] # bidirectional scores
if nn_params.label_features:
fw_scores = output[1] # forward scores, created by the « auxiliary module » only when using label features
bw_scores = output[2] # backward scores, created by the « auxiliary module » only when using label features
fw_loss = loss_function(fw_scores.view(sentence_length * batch_size, -1), nn_output.view(sentence_length*batch_size))
aux_optimizer.zero_grad() # optimiser only for the « auxiliary module »
fw_loss.backward(retain_variables=True) # parts of the graph are shared with the bidirectional model, so I keep the graph
aux_optimizer.step()
bw_loss = loss_function(bw_scores.view(sentence_length * batch_size, -1), nn_output.view(sentence_length*batch_size))
aux_optimizer.zero_grad()
bw_loss.backward(retain_variables=True)
aux_optimizer.step()
loss = loss_function(tag_scores.view(sentence_length * batch_size, -1), nn_output.view(sentence_length*batch_size))
train_loss = train_loss + loss.data[0]
optimizer.zero_grad()
loss.backward()
optimizer.step()
I know this is a lot of code and a lot of details, so if you, or anyone else, could take the time to have a look, that would be very kind. Please don’t hesitate to ask for clarifications if anything is unclear.
Thank you so much in advance