Recently I wanted to measure inference time, so I designed two cases: in the first, the model inputs are read from a file; in the second, each model input comes from standard input.
In the first case the inference time is consistently about 70 ms (except for the first call), but in the second case it is highly inconsistent, ranging from 72 ms to 483 ms.
Why does this happen, and how can I make the inference time in the second case consistent?
import time

import torch
from nemo.collections.nlp.models import TokenClassificationModel

model = TokenClassificationModel.restore_from(restore_path=checkpoint_path)

# first case: read each input text from a file
for line in open(txt_file, 'r'):
    text = line.strip()
    torch.cuda.synchronize()
    start = time.perf_counter()
    result = model.infer_one_text(text)
    torch.cuda.synchronize()
    elapsed_time = time.perf_counter() - start
    print(f'elapsed time:{elapsed_time:.6f}', flush=True)
# first case log
elapsed time:0.289745
elapsed time:0.072184
elapsed time:0.060471
elapsed time:0.062084
elapsed time:0.063837
model = TokenClassificationModel.restore_from(restore_path=checkpoint_path)

# second case: read each input text from standard input
while True:
    text = input("text:")
    torch.cuda.synchronize()
    start = time.perf_counter()
    result = model.infer_one_text(text)
    torch.cuda.synchronize()
    elapsed_time = time.perf_counter() - start
    print(f'elapsed time:{elapsed_time:.6f}', flush=True)
# second case log
elapsed time:0.483897
elapsed time:0.089466
elapsed time:0.450116
elapsed time:0.072568
elapsed time:0.469203
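
One thing I can try: run an untimed warm-up inference right before the timed call, so that if the GPU downclocks while the process sits idle at input(), the warm-up absorbs the slow call instead of the measurement. A minimal sketch (the warm-up string is an arbitrary placeholder):

# sketch: untimed warm-up call before each timed inference
while True:
    text = input("text:")
    _ = model.infer_one_text("warm up")  # untimed; wakes the GPU after idling at input()
    torch.cuda.synchronize()
    start = time.perf_counter()
    result = model.infer_one_text(text)
    torch.cuda.synchronize()
    elapsed_time = time.perf_counter() - start
    print(f'elapsed time:{elapsed_time:.6f}', flush=True)

For reference, the infer_one_text implementation: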
def infer_one_text(self, text):
    # Switch model to evaluation mode and move it to the target device
    # (note: this runs on every call, including the timed ones)
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    self.eval()
    self.to(device)
    all_probs = []
    # Build BERT input tensors from the raw text
    input_ids, input_type_ids, input_mask, subtokens_mask = self.construct_bert_input(text)
    logits = self.forward(
        input_ids=input_ids.to(device),
        token_type_ids=input_type_ids.to(device),
        attention_mask=input_mask.to(device),
    )
    # Keep predictions only for the first subtoken of each word
    subtokens_mask = subtokens_mask > 0.5
    preds = tensor2list(torch.argmax(logits, axis=-1)[subtokens_mask])
    all_probs.append(preds)
    preds = all_probs[0]
    # Map label ids back to label strings
    ids_to_labels = {v: k for k, v in self._cfg.label_ids.items()}
    labels = [ids_to_labels[pred] for pred in preds]
    return labels
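
To localize where the variance comes from (input construction, forward pass, or postprocessing), the stages of infer_one_text could be timed separately. A minimal sketch mirroring the method above (the name infer_one_text_timed is mine; tensor2list is the same helper used above):

# sketch: time each stage of infer_one_text separately
def infer_one_text_timed(model, text, device='cuda'):
    t0 = time.perf_counter()
    # stage 1: tokenization / input construction (CPU)
    input_ids, input_type_ids, input_mask, subtokens_mask = model.construct_bert_input(text)
    t1 = time.perf_counter()
    # stage 2: forward pass (GPU); synchronize before reading the clock
    logits = model.forward(
        input_ids=input_ids.to(device),
        token_type_ids=input_type_ids.to(device),
        attention_mask=input_mask.to(device),
    )
    torch.cuda.synchronize()
    t2 = time.perf_counter()
    # stage 3: postprocessing (mask subtokens, map ids to labels)
    subtokens_mask = subtokens_mask > 0.5
    preds = tensor2list(torch.argmax(logits, dim=-1)[subtokens_mask])
    ids_to_labels = {v: k for k, v in model._cfg.label_ids.items()}
    labels = [ids_to_labels[pred] for pred in preds]
    t3 = time.perf_counter()
    print(f'construct: {t1 - t0:.6f}  forward: {t2 - t1:.6f}  postprocess: {t3 - t2:.6f}')
    return labels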