Hello, I'm trying to run inference with my model in torch.bfloat16 on CPU.
But when I cast the model and inputs to torch.bfloat16, inference is much slower than float32 on an AMD Ryzen 9 5900X 12-Core Processor.
Inference speed is 9.1 iter/s with float32, but 9.1 s/iter with bfloat16.
However, bfloat16 inference was fine on an Intel(R) Xeon(R) Silver 4210R CPU @ 2.40GHz.
How can I check from Python whether bfloat16 inference will perform well on a specific CPU?
Thank you.
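So far the only thing I've found is to look at the SIMD capability PyTorch detects and at the raw CPU flags, but I'm not sure this is the right approach. A rough sketch, assuming Linux and a PyTorch build where torch.backends.cpu.get_cpu_capability() is available:

import platform
import torch

# SIMD level PyTorch dispatched to, e.g. "DEFAULT", "AVX2", "AVX512".
# Zen 3 parts like the Ryzen 9 5900X support AVX2 but not AVX-512.
print(torch.backends.cpu.get_cpu_capability())

# On Linux the kernel exposes the raw CPU flags; native bfloat16 support
# shows up as "avx512_bf16" (or "amx_bf16" on recent Intel server CPUs).
if platform.system() == "Linux":
    with open("/proc/cpuinfo") as f:
        flags = f.read()
    print("avx512_bf16 flag:", "avx512_bf16" in flags)
    print("amx_bf16 flag:", "amx_bf16" in flags)

This is the inference script I'm benchmarking: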
import torch
from tqdm import tqdm
from src.text_utils.multi_lingual_g2p import MultiLingualG2P
from src.models_vits.model_vits import VITSForInference
from src.base_utils import os, load_pth, load_config, load_df, load_json
DTYPE = torch.bfloat16
model_path = '{MODEL_PATH}'
configs = load_config(os.path.join(model_path, "config.yaml"))
model = VITSForInference(configs, "cpu").to(DTYPE)
model_state = {key: value.to(DTYPE) for key, value in load_pth(os.path.join(model_path, 'model.pth')).items()}
embeds = {key: value.to(DTYPE) for key, value in load_pth(os.path.join(model_path, 'embeds.pth')).items()}
model.load_state_dict(model_state)
model.eval()
model_info = load_json(os.path.join(model_path, "model_info.json"))
g2p = MultiLingualG2P()
text = "{INPUT_TEXT}"
language = model_info["language"][0]
phoneme = g2p.text_to_phoneme(text, language)[0]
text_sequence = g2p.phoneme_to_sequence(phoneme)
texts = torch.tensor(text_sequence, dtype=torch.long, device="cpu").unsqueeze(0)
style_embeds = embeds["embed_nat"].unsqueeze(0)
def inference(texts, style_embeds, d_control, p_control, noise_scale):
    with torch.no_grad():
        # autocast on CPU uses bfloat16 by default
        with torch.cpu.amp.autocast():
            model.forward(texts, style_embeds, d_control=d_control, p_control=p_control, noise_scale=noise_scale)

for i in tqdm(range(100)):
    inference(texts, style_embeds, d_control=1.0, p_control=1.0, noise_scale=0.1)
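
As a fallback I'm also considering just timing a small matmul in both dtypes at start-up and only enabling bfloat16 when it is not much slower than float32. A minimal sketch (the matrix size, iteration count, and 1.5x threshold are arbitrary guesses on my part):

import time
import torch

def bf16_is_reasonable(size=1024, iters=20, max_slowdown=1.5):
    # Time a float32 vs. a bfloat16 matmul; on CPUs without native bf16
    # support the bf16 path is usually dramatically slower.
    a32 = torch.randn(size, size)
    b32 = torch.randn(size, size)
    a16, b16 = a32.to(torch.bfloat16), b32.to(torch.bfloat16)

    def bench(a, b):
        torch.mm(a, b)  # warm-up
        start = time.perf_counter()
        for _ in range(iters):
            torch.mm(a, b)
        return time.perf_counter() - start

    return bench(a16, b16) <= max_slowdown * bench(a32, b32)

DTYPE = torch.bfloat16 if bf16_is_reasonable() else torch.float32

Would that be a sensible way to decide, or is there an official API for this?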