Failed to get an ONNX file with torch.onnx.export

I'm trying to export an ONNX model of LLaMA v2 with torch.onnx.export. The script testLlame_om.py below runs to completion, but no llama_opt.onnx is generated and no error message is printed.

from transformers import LlamaForCausalLM, AutoTokenizer
import torch
import torch_npu
import time

torch.set_printoptions(profile="full")
# torch.npu.set_compile_mode(jit_compile=True)
device="npu:1"

# path to the locally downloaded HF model
hf_model_path = 'models--daryl149--llama-2-7b-hf'
Model = LlamaForCausalLM.from_pretrained(hf_model_path, device_map=device)
tokenizer = AutoTokenizer.from_pretrained(hf_model_path)

# print(Model)

class SumModule(torch.nn.Module):
    def forward(self, x):
        time_start = time.time()
        generate_ids = Model.generate(x, max_length=512)
        time_end = time.time()
        print(f"generate took {time_end - time_start:.2f}s")
        return generate_ids  # forward must return the output tensor for export

def export():
    models = SumModule()
    input_ids = torch.tensor([1, 3923, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30, 3639, 596, 5655, 6975, 30])
    input_ids_npu = input_ids.to(device).unsqueeze(0)


    models.eval()  # set inference mode
    torch.onnx.export(
        models,
        (input_ids_npu,),  # args passed as a tuple
        "llama_opt.onnx",
        opset_version=11,
        export_params=False,
        verbose=False,
        input_names=["x"],
        output_names=["z"],
    )


if __name__ == "__main__":
    export()
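
To rule out a silently swallowed exception, the export call can be wrapped so that both any raised exception and the presence of the output file are reported. This is only a debugging sketch added for illustration, not part of the original testLlame_om.py, and the helper name is made up:

import os
import traceback

def export_and_report():
    # Run export() and report whether llama_opt.onnx actually appeared.
    try:
        export()
    except Exception:
        traceback.print_exc()
    print("llama_opt.onnx exists:", os.path.exists("llama_opt.onnx"))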

I could not reproduce any issues on CUDA. Do you see this issue only when NPU is used?

Can you get the file llama_opt.onnx on your CUDA machine after the script finishes running?

Since I don't have a CUDA machine, I'm not sure whether the issue exists only on NPU.

PS:
optimum-cli export onnx --model models--daryl149--llama-2-7b-hf onnx_model2 --task text-generation successfully generates an ONNX file, so the source files of models--daryl149--llama-2-7b-hf are complete.
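
A quick way to confirm the optimum-cli output is usable is to open it with onnxruntime and list its inputs and outputs. Sketch only; the file name model.onnx inside onnx_model2 is an assumption about what optimum writes, so check the actual directory contents:

import onnxruntime as ort

# Load the exported model on CPU and inspect its expected inputs/outputs.
sess = ort.InferenceSession("onnx_model2/model.onnx",
                            providers=["CPUExecutionProvider"])
print([i.name for i in sess.get_inputs()])
print([o.name for o in sess.get_outputs()])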

Refer to: python - Convert LLaMA to ONNX - Stack Overflow