Inconsistent results after loading torch model

Hey, so my model is trained to perform NER (sequence tagging) using a BiLSTM network with a CRF layer as the classifier.

I have followed this tutorial to recreate a model on my own dataset: intro-to-nlp-with-pytorch/NamedEntityRecognition.ipynb at master · PythonWorkshop/intro-to-nlp-with-pytorch · GitHub

I obtain high accuracy on my train and test sets when the model is loaded and tested in the same kernel session. However, when I restart my kernel, it produces terrible results. I have tried saving the model both ways (saving the full model and saving the parameters separately).
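For reference, the two standard saving approaches look like this (a minimal sketch using a toy `nn.Linear` as a stand-in for the BiLSTM_CRF; the file names are placeholders):

```python
import torch
import torch.nn as nn

# Toy stand-in for the trained model (the real BiLSTM_CRF takes more ctor args).
model = nn.Linear(4, 2)

# Approach 1 (recommended): save only the learned parameters.
torch.save(model.state_dict(), "model_state.pt")
restored = nn.Linear(4, 2)                      # rebuild the architecture first
restored.load_state_dict(torch.load("model_state.pt"))

# Approach 2: pickle the entire module (fragile if the class definition changes).
torch.save(model, "model_full.pt")
restored_full = torch.load("model_full.pt", weights_only=False)

x = torch.ones(1, 4)
with torch.no_grad():
    print(torch.allclose(model(x), restored(x)))       # True
    print(torch.allclose(model(x), restored_full(x)))  # True
```

Both round-trip the weights; the state_dict route is usually preferred because it does not depend on pickling the class itself.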

I have also tried loading all the layers individually (the BiLSTM, linear, embeddings), but the results are still very bad. I tried the experiment with the Adam, RMSprop, and SGD optimizers.
Adam and RMSprop produce very good results when the model is trained, saved, loaded, and tested in the same kernel.
However, after restarting the kernel, SGD is slightly better (still bad), whereas Adam and RMSprop are really bad.

Can someone please give me insights as to where I am going wrong?

class BiLSTM_CRF(nn.Module):

    def __init__(self, vocab_size, tag_to_ix, embedding_dim, hidden_dim):
        """Initialize network."""
        super(BiLSTM_CRF, self).__init__()
        self.embedding_dim = embedding_dim
        self.hidden_dim = hidden_dim
        self.vocab_size = vocab_size
        self.tag_to_ix = tag_to_ix
        self.tagset_size = len(tag_to_ix)

        self.word_embeds = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim // 2,
                            num_layers=1, bidirectional=True)
        self.dropout = torch.nn.Dropout(0.15)

        # Maps the output of the LSTM into tag space.
        self.hidden2tag = nn.Linear(hidden_dim, self.tagset_size)

        # Matrix of transition parameters.  Entry i,j is the score of
        # transitioning *to* i *from* j.
        state = torch.get_rng_state()
        self.transitions = nn.Parameter(
            torch.randn(self.tagset_size, self.tagset_size))
        # print(self.transitions)

        # These two statements enforce the constraint that we never transfer
        # to the start tag and we never transfer from the stop tag.
        self.transitions.data[tag_to_ix[START_TAG], :] = -10000
        self.transitions.data[:, tag_to_ix[STOP_TAG]] = -10000

        self.hidden = self.init_hidden()
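One thing worth noting about the constructor above: because `transitions` is wrapped in `nn.Parameter`, it is registered with the module and included in its `state_dict`, so `load_state_dict` restores the learned matrix instead of leaving a fresh random one behind. A minimal sketch with a toy module:

```python
import torch
import torch.nn as nn

class Toy(nn.Module):
    def __init__(self):
        super().__init__()
        # Randomly initialized, like the CRF transition matrix.
        self.transitions = nn.Parameter(torch.randn(3, 3))

m = Toy()
print("transitions" in m.state_dict())             # True: nn.Parameter is tracked

m2 = Toy()                                         # different random init
m2.load_state_dict(m.state_dict())
print(torch.equal(m.transitions, m2.transitions))  # True: the matrix round-trips
```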


Training code:


torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
state = torch.get_rng_state()

losses = []
epochs = []

for epoch in range(10):
    for sentence, tags in train_data:
        # Step 1. Remember that PyTorch accumulates gradients.
        # We need to clear them out before each instance.
        model.zero_grad()

        # Step 2. Get our inputs ready for the network.
        sentence_in = prepare_sequence(sentence, word_to_ix)
        targets = torch.tensor([tag_to_ix[t] for t in tags], dtype=torch.long)

        # Step 3. Compute the loss, gradients, and update the parameters.
        loss = model.neg_log_likelihood(sentence_in, targets)
        losses.append(loss.item())
        loss.backward()
        optimizer.step()

    print("Epoch: {} Loss: {}".format(epoch+1, np.mean(losses)))






Saving the parameters:

torch.save(model.transitions, '')
torch.save(model.word_embeds, '')
torch.save(model.lstm, '')
torch.save(model.hidden2tag, '')

Saving the model:

torch.save(model, 'saved_model/')

Loading the model and the parameters:

import random

model2 = BiLSTM_CRF(len(word_to_ix), tag_to_ix, EMBEDDING_DIM, HIDDEN_DIM)

model2 = torch.load('saved_model/')

model2.transitions = torch.load('')
model2.word_embeds = torch.load('')
model2.lstm = torch.load('')
model2.hidden2tag = torch.load('')

state = torch.get_rng_state()

Testing the model:

torch.backends.cudnn.deterministic = True

accuracies = []
predicted_tags = []

# Testing the model; no need to accumulate gradients.
with torch.no_grad():
    for i in range(len(train_data)):
        precheck_sent = prepare_sequence(train_data[i][0], word_to_ix)
        pred = model2(precheck_sent)[1]
        prediction = [ix_to_tag[idx] for idx in pred]

        print('Prediction:   ', prediction)
        print('Ground truth: ', train_data[i][1])
        accuracy = sum(1 for x, y in zip(prediction, train_data[i][1]) if x == y) / float(len(train_data[i][1]))
        accuracies.append(accuracy)

    for i in range(len(test_data)):
        precheck_sent = prepare_sequence(test_data[i][0], word_to_ix)
        pred = model2(precheck_sent)[1]
        prediction = [ix_to_tag[idx] for idx in pred]

        print('Prediction:   ', prediction)
        print('Ground truth: ', test_data[i][1])
        accuracy = sum(1 for x, y in zip(prediction, test_data[i][1]) if x == y) / float(len(test_data[i][1]))
        accuracies.append(accuracy)



To debug the issue compare the outputs of a static tensor (either define one via e.g. torch.ones or save a specific input batch) before saving the model and after loading it in the other notebook. If the results are equal, this would point to a difference in the data processing. On the other hand, if the results are different, either the model creates “random” outputs e.g. via some random operations (call model.eval() to disable dropout layers and check for other random operations) or the saving/loading failed.
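The suggested check could be sketched like this (again with a toy `nn.Linear` standing in for the real model, and a placeholder file name):

```python
import torch
import torch.nn as nn

# Before saving (training kernel): record a fixed input and the model's output.
model = nn.Linear(4, 2)                      # stand-in for the trained BiLSTM_CRF
model.eval()                                 # disable dropout for a stable reference
fixed_input = torch.ones(1, 4)
with torch.no_grad():
    reference = model(fixed_input)
torch.save({"input": fixed_input, "output": reference,
            "state_dict": model.state_dict()}, "debug_ckpt.pt")

# After restarting (fresh kernel): reload and compare against the reference.
ckpt = torch.load("debug_ckpt.pt")
model2 = nn.Linear(4, 2)
model2.load_state_dict(ckpt["state_dict"])
model2.eval()
with torch.no_grad():
    out = model2(ckpt["input"])
print(torch.allclose(out, ckpt["output"]))   # True if save/load round-tripped
```

If this comparison passes in the fresh kernel but the real pipeline still degrades, the difference is upstream of the model, i.e. in the data processing.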

I saved a static input batch before training and saving the model. The results are consistent after loading the model as well. But please note that the input batch doesn't go through any data processing here. The randomness is mainly caused by the transitions variable in the BiLSTM_CRF class' constructor. Is there anything else I must check for in terms of randomness?
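One more source of randomness in this model is the `nn.Dropout(0.15)` layer in the constructor: in training mode it randomly zeroes activations on every forward pass, so unless `model2.eval()` is called before testing, outputs will vary even with identical weights. A minimal sketch of the difference:

```python
import torch
import torch.nn as nn

drop = nn.Dropout(0.15)
x = torch.ones(10)

drop.train()                 # training mode: dropout is active
a, b = drop(x), drop(x)      # two calls usually differ

drop.eval()                  # eval mode: dropout is a no-op
c, d = drop(x), drop(x)
print(torch.equal(c, d))     # True: eval-mode outputs are deterministic
print(torch.equal(c, x))     # True: the input passes through unchanged
```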

Another thing I tried was to save the model's output for a single input batch before training and after training.
In the same kernel instance, after training, I get good results. When I restart the kernel, I get poor results for that single input batch. If I save the results right after training and load them in another instance, they do match, but only because I am saving the output itself, which is not what's required.

I need my model's outputs to stay consistent even after restarting a kernel, and to remove any randomness so that it's reproducible.
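One cross-kernel pitfall worth ruling out (an assumption on my side, since the preprocessing code isn't shown): if `word_to_ix` / `tag_to_ix` are rebuilt in the fresh kernel and the construction order isn't deterministic, the same word can map to a different embedding row, which would produce exactly this "good in-session, bad after restart" behaviour even when the weights load correctly. Saving the mappings alongside the weights rules it out (the dict contents below are hypothetical):

```python
import torch

# Hypothetical mappings; in the real code these come from the training data.
word_to_ix = {"the": 0, "dog": 1, "barked": 2}
tag_to_ix = {"O": 0, "B-ANIMAL": 1, "I-ANIMAL": 2}

# Save the vocabulary together with the checkpoint so a fresh kernel
# reuses the exact same index assignment instead of rebuilding it.
torch.save({"word_to_ix": word_to_ix, "tag_to_ix": tag_to_ix}, "vocab.pt")

ckpt = torch.load("vocab.pt")
print(ckpt["word_to_ix"] == word_to_ix)  # True: indices survive the restart
```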

Please give me some more insights as to what should be done next.