How to use model for inference (biomed NER BERT Tagger)

Hi there,

I am quite new to PyTorch, so excuse me if I don’t get obvious things right…

I trained a biomedical NER tagger using BioBERT’s pre-trained BERT model, fine-tuned on the GENETAG dataset with Hugging Face’s transformers library. Training seems to have gone through, and I got an F1 of about 90%. I am now left with this:

.
├── checkpoint-1500
│   ├── config.json
│   ├── optimizer.pt
│   ├── pytorch_model.bin
│   ├── scheduler.pt
│   └── training_args.bin
├── checkpoint-2250
│   ├── config.json
│   ├── optimizer.pt
│   ├── pytorch_model.bin
│   ├── scheduler.pt
│   └── training_args.bin
├── checkpoint-750
│   ├── config.json
│   ├── optimizer.pt
│   ├── pytorch_model.bin
│   ├── scheduler.pt
│   └── training_args.bin
├── config.json
├── eval_results.txt
├── pytorch_model.bin
├── special_tokens_map.json
├── test_predictions.txt
├── test_results.txt
├── tokenizer_config.json
├── training_args.bin
└── vocab.txt

where I proceed to make the model ready for prediction like this:

import json
import os

import torch
import torch.nn.functional as F
from nltk.tokenize import word_tokenize  # assuming NLTK's word tokenizer
from transformers import AutoModelForTokenClassification, BertTokenizer


class Ner:

    def __init__(self, model_dir: str):
        self.model, self.tokenizer, self.model_config = self.load_model(model_dir)
        self.label_map = {int(k): v for k, v in self.model_config["id2label"].items()}
        self.max_seq_length = self.model_config["max_seq_length"]
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        self.model = self.model.to(self.device)
        self.model.eval()

    def load_model(self, model_dir: str, model_config: str = "config.json"):
        model_config = os.path.join(model_dir,model_config)
        model_config = json.load(open(model_config))
        model = AutoModelForTokenClassification.from_pretrained("biobert_v1.1_pubmed")
        ckpt = torch.load(model_dir + "/pytorch_model.bin")
        model.load_state_dict(ckpt)
        tokenizer = BertTokenizer.from_pretrained("biobert_v1.1_pubmed", do_lower_case=model_config["do_lower"])
        return model, tokenizer, model_config

    def tokenize(self, text: str):
        """ tokenize input"""
        words = word_tokenize(text)
        tokens = []
        valid_positions = []
        for word in words:
            word_tokens = self.tokenizer.tokenize(word)
            tokens.extend(word_tokens)
            # mark only the first sub-token of each word as valid
            for j in range(len(word_tokens)):
                valid_positions.append(1 if j == 0 else 0)
        return tokens, valid_positions

    def preprocess(self, text: str):
        """ preprocess """
        tokens, valid_positions = self.tokenize(text)
        ## insert "[CLS]"
        tokens.insert(0,"[CLS]")
        valid_positions.insert(0,1)
        ## insert "[SEP]"
        tokens.append("[SEP]")
        valid_positions.append(1)
        segment_ids = [0] * len(tokens)
        input_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        input_mask = [1] * len(input_ids)
        while len(input_ids) < self.max_seq_length:
            input_ids.append(0)
            input_mask.append(0)
            segment_ids.append(0)
            valid_positions.append(0)
        return input_ids,input_mask,segment_ids,valid_positions

    def predict(self, text: str):
        input_ids,input_mask,segment_ids,valid_ids = self.preprocess(text)
        input_ids = torch.tensor([input_ids],dtype=torch.long,device=self.device)
        input_mask = torch.tensor([input_mask],dtype=torch.long,device=self.device)
        segment_ids = torch.tensor([segment_ids],dtype=torch.long,device=self.device)
        valid_ids = torch.tensor([valid_ids],dtype=torch.long,device=self.device)
        with torch.no_grad():
            logits = self.model(input_ids, segment_ids, input_mask,valid_ids)[0]
        print(type(logits))
        print(logits.shape)
        # while True:  print(eval(input()))
        logits = F.softmax(logits,dim=2)
        logits_label = torch.argmax(logits,dim=2)
        logits_label = logits_label.detach().cpu().numpy().tolist()[0]

        logits_confidence = [values[label].item() for values,label in zip(logits[0],logits_label)]

        logits = []
        pos = 0
        for index,mask in enumerate(valid_ids[0]):
            if index == 0:
                continue
            if mask == 1:
                logits.append((logits_label[index-pos],logits_confidence[index-pos]))
            else:
                pos += 1
        logits.pop()

        labels = [(self.label_map[label],confidence) for label,confidence in logits]
        labels = [(label,confidence) for label,confidence in logits]
        words = word_tokenize(text)
        assert len(labels) == len(words)
        output = [{"tag":label, "word":word,"confidence":confidence} for word,(label,confidence) in zip(words,labels)]
        return output

This is taken and adjusted from here, so I am really unsure whether I did the model loading correctly!

Now when I do

model_dir = "/home/marcel/Desktop/transformers-master/examples/token-classification/BioBERT_ner/output"
ner = Ner(model_dir)
out = ner.predict('Large T antigen was coimmunoprecipitated by antibodies to epitope-tagged TBP , endogenous TBP , hTAF ( II ) 100 , hTAF ( II ) 130 , and hTAF ( II ) 250 , under conditions where holo-TFIID would be precipitated .')
print(*out, sep="\n")

I get

{'tag': 0, 'word': 'Large', 'confidence': 0.9638992547988892}
{'tag': 0, 'word': 'T', 'confidence': 0.9511561989784241}
{'tag': 0, 'word': 'antigen', 'confidence': 0.8594477772712708}
{'tag': 0, 'word': 'was', 'confidence': 0.9952380657196045}
{'tag': 0, 'word': 'coimmunoprecipitated', 'confidence': 0.9960350394248962}
{'tag': 0, 'word': 'by', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'antibodies', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'to', 'confidence': 0.9781185984611511}
{'tag': 0, 'word': 'epitope-tagged', 'confidence': 0.9603662490844727}
{'tag': 0, 'word': 'TBP', 'confidence': 0.9726290106773376}
{'tag': 0, 'word': ',', 'confidence': 0.9269258975982666}
{'tag': 0, 'word': 'endogenous', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'TBP', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': ',', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'hTAF', 'confidence': 0.8735837340354919}
{'tag': 0, 'word': '(', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'II', 'confidence': 0.9886957406997681}
{'tag': 0, 'word': ')', 'confidence': 0.9799841046333313}
{'tag': 0, 'word': '100', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': ',', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'hTAF', 'confidence': 0.8735837340354919}
{'tag': 0, 'word': '(', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'II', 'confidence': 0.9886957406997681}
{'tag': 0, 'word': ')', 'confidence': 0.8922699093818665}
{'tag': 0, 'word': '130', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': ',', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'and', 'confidence': 0.9816873669624329}
{'tag': 0, 'word': 'hTAF', 'confidence': 0.9520930051803589}
{'tag': 0, 'word': '(', 'confidence': 0.9822953939437866}
{'tag': 0, 'word': 'II', 'confidence': 0.9713449478149414}
{'tag': 0, 'word': ')', 'confidence': 0.9886957406997681}
{'tag': 0, 'word': '250', 'confidence': 0.8922699093818665}
{'tag': 0, 'word': ',', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'under', 'confidence': 0.9970616698265076}
{'tag': 0, 'word': 'conditions', 'confidence': 0.9816873669624329}
{'tag': 0, 'word': 'where', 'confidence': 0.9520930051803589}
{'tag': 0, 'word': 'holo-TFIID', 'confidence': 0.9822953939437866}
{'tag': 0, 'word': 'would', 'confidence': 0.9650858044624329}
{'tag': 0, 'word': 'be', 'confidence': 0.9886957406997681}
{'tag': 0, 'word': 'precipitated', 'confidence': 0.9866456389427185}
{'tag': 0, 'word': '.', 'confidence': 0.8922699093818665}

which is definitely not correct. What am I doing wrong?

Okay, so I managed to resolve the problem.
Maybe it helps someone, so this is the script I wrote:

Note: utils_ner is in master/examples/token-classification, and align_predictions is taken from run_ner in the same directory. The script basically follows the prediction logic in run_ner. Also, (at least here) the input data must be in CoNLL-2003 format as well!
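
That format is one token per line followed by its label, with a blank line between sentences; for prediction the label column can just be a dummy O. The snippet below is only my illustration of the layout, and I assume NerDataset picks up a file named after the mode argument (e.g. pred.txt) inside data_dir:

```
Large O
T O
antigen O
was O
coimmunoprecipitated O

holo-TFIID O
would O
be O
precipitated O
. O
```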

from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from utils_ner import NerDataset
import torch
import numpy as np
from torch import nn
from typing import Tuple, List

pretrained_dir = "./biobert_v1.1_pubmed"
cache_dir = "./BioBERT_ner/data"
config_dir = "./biobert_v1.1_pubmed/config.json"
model_dir = "./BioBERT_ner/output_3/pytorch_model.bin"
data_dir = "./BioBERT_ner/pred_data"
labels_dir = "./BioBERT_ner/data/labels.txt"
output_dir = "./BioBERT_ner"

label_map = {
    0: "O",
    1: "B"
    }

def align_predictions(predictions: np.ndarray, label_ids: np.ndarray) -> Tuple[List[int], List[int]]:
    preds = np.argmax(predictions, axis=2)

    batch_size, seq_len = preds.shape

    out_label_list = [[] for _ in range(batch_size)]
    preds_list = [[] for _ in range(batch_size)]

    for i in range(batch_size):
        for j in range(seq_len):
            if label_ids[i, j] != nn.CrossEntropyLoss().ignore_index:
                out_label_list[i].append(label_map[label_ids[i][j]])
                preds_list[i].append(label_map[preds[i][j]])

    return preds_list, out_label_list

tokenizer = AutoTokenizer.from_pretrained(pretrained_dir, cache_dir=cache_dir)
model = AutoModelForTokenClassification.from_pretrained(pretrained_dir, config=config_dir, cache_dir=cache_dir)
model.load_state_dict(torch.load(model_dir))
dataset = NerDataset(
    data_dir=data_dir,
    tokenizer=tokenizer,
    labels=["B", "O"],
    model_type="BertForTokenClassification",
    max_seq_length=256,
    mode="pred"
)

args = TrainingArguments(
    output_dir=output_dir
)

trainer = Trainer(
    model=model,
    args=args
)

predictions, label_ids, metrics = trainer.predict(dataset)
preds_list, _ = align_predictions(predictions, label_ids)
print(preds_list)
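
If you then want to map the predictions back to the input tokens, something like this rough sketch should work (assuming the prediction file is ./BioBERT_ner/pred_data/pred.txt in the token-per-line format above; read_sentences is just a quick helper I am sketching here, not something from run_ner):

```
def read_sentences(path):
    """Read a CoNLL-style file back into lists of tokens, one list per sentence."""
    sentences, current = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                if current:
                    sentences.append(current)
                    current = []
            else:
                current.append(line.split()[0])  # first column is the token
    if current:
        sentences.append(current)
    return sentences

sentences = read_sentences("./BioBERT_ner/pred_data/pred.txt")
for words, tags in zip(sentences, preds_list):
    print(list(zip(words, tags)))
```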

Thanks for sharing your code! So is this NER model only a binary classifier, or is it able to tell which type of entity was predicted?

Yes, I trained my model as a binary classifier, so a token either is a gene/protein entity or it is not, because I simply want to calculate the ratio of medical words to total words to determine whether a document can be classified as medical. I uploaded my whole code with a high-level explanation of every step here: https://github.com/marcelbra/DocTagger
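
Roughly, the ratio idea looks like this (just a sketch; the function name and the threshold are made up here and are not the exact code from DocTagger):

```
def medical_word_ratio(tagged_words):
    """tagged_words: list of (word, tag) pairs where tag is 'O' for non-entities."""
    if not tagged_words:
        return 0.0
    n_entities = sum(1 for _, tag in tagged_words if tag != "O")
    return n_entities / len(tagged_words)

# tag a document, then call it "medical" if enough of its words are entities
tagged = [("Large", "B"), ("T", "B"), ("antigen", "B"), ("was", "O"), ("precipitated", "O")]
is_medical = medical_word_ratio(tagged) > 0.05  # the 0.05 threshold is a placeholder
```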

You can easily use a different data set, or keep the same one; you just need to change the labels the model uses if you want to use GENETAG and differentiate between genes and proteins.
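
For example, to go multi-class you would only change the labels you pass to NerDataset and the id-to-label mapping; the IOB names below are just an illustration, not the actual GENETAG tag set:

```
# illustrative only: a multi-class label set instead of the binary B/O one
labels = ["O", "B-GENE", "I-GENE", "B-PROTEIN", "I-PROTEIN"]
label_map = {i: label for i, label in enumerate(labels)}
```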

run_ner and utils_ner are from the transformers repo; with these you can train a model.
In doc_builder the actual tagging of a document happens (I’m working on CORD-19).
pred_ner does the actual prediction; I may have modified the script I posted earlier!

Hope it helps 🙂