I am using dynamic quantization on a fine-tuned BERT model. When I run inference on the quantized model before saving it, the quantized and unquantized models give almost identical results (accuracy score), and inference time drops as well.
However, when I load the saved quantized model and run inference on it, accuracy drops significantly (around 30 to 40%). Is this caused by the way the quantized model is loaded?
Any leads would be appreciated. Thanks!
Following is the code:
import os

import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
from torch import quantization
from tqdm import tqdm
from transformers import BertConfig, BertForSequenceClassification, BertTokenizer

# read_tsv_file, create_dataloader, and args come from my own helpers / argparse.


def load_model(args):
    config = BertConfig.from_pretrained(args.model_dir)
    tokenizer = BertTokenizer.from_pretrained(
        args.model_dir, do_lower_case=args.do_lower_case
    )
    model = BertForSequenceClassification.from_pretrained(args.model_dir, config=config)
    return model, tokenizer


def predict_label(model, inputs):
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs[0]
    probs = F.softmax(logits, dim=1)
    predicted = torch.argmax(probs, dim=1)
    labels = predicted.detach().cpu().numpy().tolist()
    label_confidences = [
        confidence[label].item() for confidence, label in zip(probs, labels)
    ]
    return labels, label_confidences
def predict(eval_dataloader, model, examples, device):
    labels_for_evaluations = []
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        input_ids = batch["input_ids"].to(device, dtype=torch.long)
        mask_ids = batch["mask_ids"].to(device, dtype=torch.long)
        # token_type_ids are prepared but never passed to the model;
        # BertForSequenceClassification defaults them to zeros.
        token_type_ids = batch["token_type_ids"].to(device, dtype=torch.long)
        inputs = {"input_ids": input_ids, "attention_mask": mask_ids}
        predicted_labels, label_confidences = predict_label(model, inputs)
        for pred_label in predicted_labels:
            labels_for_evaluations.append(str(pred_label))
    return labels_for_evaluations
if __name__ == "__main__":
    examples, labels = read_tsv_file(args.data_file)
    bert_model, tokenizer = load_model(args)
    bert_model.to(args.device)

    # Perform dynamic quantization on all nn.Linear layers.
    quantized_model = quantization.quantize_dynamic(
        bert_model, {nn.Linear}, dtype=torch.qint8
    )
    print("quantized model ", quantized_model)

    dataframe = pd.DataFrame({"text": examples})
    batch_size = 1
    eval_dataloader = create_dataloader(
        dataframe, tokenizer, args.max_seq_length, batch_size, test_data=True
    )

    # Inference with the in-memory quantized model (accuracy is fine here).
    labels_for_evaluations = predict(
        eval_dataloader, quantized_model, examples, args.device
    )

    # Serialize the quantized model.
    quantized_output_dir = args.model_dir + "_quantized_batch1"
    if not os.path.exists(quantized_output_dir):
        os.makedirs(quantized_output_dir)
    quantized_model.save_pretrained(quantized_output_dir)
    tokenizer.save_pretrained(quantized_output_dir)

    print("accuracy score ", accuracy_score(labels, labels_for_evaluations))
Update
I found that many people are facing a similar issue: when you load a quantized BERT model back with from_pretrained(), there is a huge decrease in accuracy. Here are related issues on GitHub:
Dynamic Quantization on ALBERT (pytorch) #2542
Quantized model not preserved when imported using from_pretrained() #2556
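From those threads, the cause seems to be that from_pretrained() rebuilds a regular float model from the config, so the dynamically quantized modules do not survive the save/load round trip. A workaround suggested there is to save only the state_dict with torch.save, then re-apply quantize_dynamic to a freshly loaded float model before restoring the weights. A minimal sketch (the quantized_state_dict.bin filename is my own choice):

# Save: keep the quantized model's state_dict instead of save_pretrained.
torch.save(
    quantized_model.state_dict(),
    os.path.join(quantized_output_dir, "quantized_state_dict.bin"),
)

# Load: rebuild the float model from the ORIGINAL checkpoint, re-apply the
# same dynamic quantization, and only then restore the quantized weights.
config = BertConfig.from_pretrained(args.model_dir)
model = BertForSequenceClassification.from_pretrained(args.model_dir, config=config)
model = quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)
model.load_state_dict(
    torch.load(os.path.join(quantized_output_dir, "quantized_state_dict.bin"))
)
model.eval()

Serializing the whole module with torch.save(quantized_model, path) and torch.load should also keep the quantized modules intact.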