Optimized the execution interval between CUDA kernels caused by view and memcopy operations (#5390)

* opt_view_and_memcopy

* fix bugs in ci

* fix ci bugs

* update benchmark scripts

* fix ci bugs
This commit is contained in:
yuehuayingxueluo
2024-02-21 13:23:57 +08:00
committed by GitHub
parent 730103819d
commit 2a718c8be8
8 changed files with 141 additions and 55 deletions

View File

@@ -95,7 +95,7 @@ def benchmark_inference(args):
else:
assert args.model_path, "When testing pretrained weights, the model path must be provided.'"
model = transformers.LlamaForCausalLM.from_pretrained(args.model_path).cuda()
tokenizer = AutoTokenizer.from_pretrained(args.model_path)
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
model = model.eval()
@@ -122,6 +122,7 @@ def benchmark_inference(args):
elif args.mode == "vllm":
engine = LLM(
model=args.model_path,
tokenizer="hf-internal-testing/llama-tokenizer",
max_num_seqs=mbsz,
dtype="float16",
enforce_eager=True,