Hi there,
I am quite new to PyTorch, so excuse me if I don’t get obvious things right…
I trained a biomedical NER tagger using BioBERT’s pre-trained BERT model, fine-tuned on the GENETAG dataset with Hugging Face’s transformers library. Training seems to have gone through fine and I got an F1 of about 90%. I am now left with this:
.
├── checkpoint-1500
│   ├── config.json
│   ├── optimizer.pt
│   ├── pytorch_model.bin
│   ├── scheduler.pt
│   └── training_args.bin
├── checkpoint-2250
│   ├── config.json
│   ├── optimizer.pt
│   ├── pytorch_model.bin
│   ├── scheduler.pt
│   └── training_args.bin
├── checkpoint-750
│   ├── config.json
│   ├── optimizer.pt
│   ├── pytorch_model.bin
│   ├── scheduler.pt
│   └── training_args.bin
├── config.json
├── eval_results.txt
├── pytorch_model.bin
├── special_tokens_map.json
├── test_predictions.txt
├── test_results.txt
├── tokenizer_config.json
├── training_args.bin
└── vocab.txt
where I proceed to make the model ready for prediction like this:
import json
import os

import torch
import torch.nn.functional as F
from nltk.tokenize import word_tokenize
from transformers import AutoModelForTokenClassification, BertTokenizer


class Ner:

    def __init__(self, model_dir: str):
        self.model, self.tokenizer, self.model_config = self.load_model(model_dir)
        self.label_map = {int(k): v for k, v in self.model_config["id2label"].items()}
        self.max_seq_length = self.model_config["max_seq_length"]
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = self.model.to(self.device)
        self.model.eval()

    def load_model(self, model_dir: str, model_config: str = "config.json"):
        model_config = os.path.join(model_dir, model_config)
        model_config = json.load(open(model_config))
        model = AutoModelForTokenClassification.from_pretrained("biobert_v1.1_pubmed")
        ckpt = torch.load(os.path.join(model_dir, "pytorch_model.bin"))
        model.load_state_dict(ckpt)
        tokenizer = BertTokenizer.from_pretrained("biobert_v1.1_pubmed", do_lower_case=model_config["do_lower"])
        return model, tokenizer, model_config

    def tokenize(self, text: str):
        """Tokenize input; mark the first sub-token of every word as valid."""
        words = word_tokenize(text)
        tokens = []
        valid_positions = []
        for word in words:
            token = self.tokenizer.tokenize(word)
            tokens.extend(token)
            for i in range(len(token)):
                valid_positions.append(1 if i == 0 else 0)
        return tokens, valid_positions

    def preprocess(self, text: str):
        """Build input ids, attention mask, segment ids and valid positions."""
        tokens, valid_positions = self.tokenize(text)
        # insert "[CLS]"
        tokens.insert(0, "[CLS]")
        valid_positions.insert(0, 1)
        # append "[SEP]"
        tokens.append("[SEP]")
        valid_positions.append(1)
        segment_ids = [0] * len(tokens)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        # pad everything to max_seq_length
        while len(input_ids) < self.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            valid_positions.append(0)
        return input_ids, input_mask, segment_ids, valid_positions

    def predict(self, text: str):
        input_ids, input_mask, segment_ids, valid_ids = self.preprocess(text)
        input_ids = torch.tensor([input_ids], dtype=torch.long, device=self.device)
        input_mask = torch.tensor([input_mask], dtype=torch.long, device=self.device)
        segment_ids = torch.tensor([segment_ids], dtype=torch.long, device=self.device)
        valid_ids = torch.tensor([valid_ids], dtype=torch.long, device=self.device)
        with torch.no_grad():
            logits = self.model(input_ids, segment_ids, input_mask, valid_ids)[0]
        logits = F.softmax(logits, dim=2)
        logits_label = torch.argmax(logits, dim=2)
        logits_label = logits_label.detach().cpu().numpy().tolist()[0]
        logits_confidence = [values[label].item() for values, label in zip(logits[0], logits_label)]
        # keep only the predictions at valid positions (first sub-token of each word)
        logits = []
        pos = 0
        for index, mask in enumerate(valid_ids[0]):
            if index == 0:
                continue
            if mask == 1:
                logits.append((logits_label[index - pos], logits_confidence[index - pos]))
            else:
                pos += 1
        logits.pop()  # drop the prediction for "[SEP]"
        labels = [(self.label_map[label], confidence) for label, confidence in logits]
        labels = [(label, confidence) for label, confidence in logits]
        words = word_tokenize(text)
        assert len(labels) == len(words)
        output = [{"tag": label, "word": word, "confidence": confidence}
                  for word, (label, confidence) in zip(words, labels)]
        return output
This is taken and adjusted from here, so I am really unsure whether I did the model loading correctly!
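If I read the transformers docs correctly, from_pretrained could also point straight at the output directory instead of loading the state dict by hand. A minimal sketch of what I mean, untested on my side:

from transformers import AutoModelForTokenClassification, AutoTokenizer

# assumption: the top-level output dir holds the final model and tokenizer files
model = AutoModelForTokenClassification.from_pretrained(model_dir)  # reads config.json + pytorch_model.bin
tokenizer = AutoTokenizer.from_pretrained(model_dir)                # reads vocab.txt + tokenizer_config.json

I stuck with the manual loading above only because that is what the code I adapted does.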
Now when I do
model_dir = "/home/marcel/Desktop/transformers-master/examples/token-classification/BioBERT_ner/output"
ner = Ner(model_dir)
out = ner.predict('Large T antigen was coimmunoprecipitated by antibodies to epitope-tagged TBP , endogenous TBP , hTAF ( II ) 100 , hTAF ( II ) 130 , and hTAF ( II ) 250 , under conditions where holo-TFIID would be precipitated .')
print(*out, sep="\n")
I get
{'tag': 0, 'word': 'Large', 'confidence': 0.9638992547988892}
{'tag': 0, 'word': 'T', 'confidence': 0.9511561989784241}
{'tag': 0, 'word': 'antigen', 'confidence': 0.8594477772712708}
{'tag': 0, 'word': 'was', 'confidence': 0.9952380657196045}
{'tag': 0, 'word': 'coimmunoprecipitated', 'confidence': 0.9960350394248962}
{'tag': 0, 'word': 'by', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'antibodies', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'to', 'confidence': 0.9781185984611511}
{'tag': 0, 'word': 'epitope-tagged', 'confidence': 0.9603662490844727}
{'tag': 0, 'word': 'TBP', 'confidence': 0.9726290106773376}
{'tag': 0, 'word': ',', 'confidence': 0.9269258975982666}
{'tag': 0, 'word': 'endogenous', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'TBP', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': ',', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'hTAF', 'confidence': 0.8735837340354919}
{'tag': 0, 'word': '(', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'II', 'confidence': 0.9886957406997681}
{'tag': 0, 'word': ')', 'confidence': 0.9799841046333313}
{'tag': 0, 'word': '100', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': ',', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'hTAF', 'confidence': 0.8735837340354919}
{'tag': 0, 'word': '(', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'II', 'confidence': 0.9886957406997681}
{'tag': 0, 'word': ')', 'confidence': 0.8922699093818665}
{'tag': 0, 'word': '130', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': ',', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'and', 'confidence': 0.9816873669624329}
{'tag': 0, 'word': 'hTAF', 'confidence': 0.9520930051803589}
{'tag': 0, 'word': '(', 'confidence': 0.9822953939437866}
{'tag': 0, 'word': 'II', 'confidence': 0.9713449478149414}
{'tag': 0, 'word': ')', 'confidence': 0.9886957406997681}
{'tag': 0, 'word': '250', 'confidence': 0.8922699093818665}
{'tag': 0, 'word': ',', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'under', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'conditions', 'confidence': 0.9816873669624329}
{'tag': 0, 'word': 'where', 'confidence': 0.9520930051803589}
{'tag': 0, 'word': 'holo-TFIID', 'confidence': 0.9822953939437866}
{'tag': 0, 'word': 'would', 'confidence': 0.9650858044624329}
{'tag': 0, 'word': 'be', 'confidence': 0.9886957406997681}
{'tag': 0, 'word': 'precipitated', 'confidence': 0.9866456389427185}
{'tag': 0, 'word': '.', 'confidence': 0.8922699093818665}
which is definitely not correct. What am I doing wrong?
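For reference, in case it helps to diagnose: the label map can be inspected straight from the saved config, something like this (a minimal sketch; id2label is the same mapping I read in load_model above):

from transformers import AutoConfig

config = AutoConfig.from_pretrained(model_dir)
print(config.num_labels)  # number of tag classes the head was trained with
print(config.id2label)    # index -> tag string, e.g. {0: 'O', ...}

Happy to post my config.json or the training command if that helps.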