[Inference] Optimize generation process of inference engine (#5356)

* optimize inference engine generation process

* fix run_benchmark.sh

* fix generate in engine.py

* roll back test_inference_engine.py
yuehuayingxueluo
2024-02-02 15:38:21 +08:00
committed by GitHub
parent 21ad4a27f9
commit 631862f339
3 changed files with 21 additions and 16 deletions


@@ -141,8 +141,7 @@ def benchmark_inference(args):
     with ctx:
         for _ in range(N_WARMUP_STEPS):
             if args.mode == "caiinference":
-                engine.add_request(prompts_token_ids=data)
-                engine.generate(generation_config)
+                engine.generate(prompts_token_ids=data, generation_config=generation_config)
             else:
                 engine.generate(data, generation_config=generation_config)
             if args.profile:
@@ -156,8 +155,7 @@ def benchmark_inference(args):
     whole_end2end = time.perf_counter()
     if args.mode == "caiinference":
         for _ in range(args.batch_size // mbsz):
-            engine.add_request(prompts_token_ids=data)
-            engine.generate(generation_config)
+            engine.generate(prompts_token_ids=data, generation_config=generation_config)
     else:
         for _ in range(args.batch_size // mbsz):
             engine.generate(data, generation_config=generation_config)
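
After this change, callers hand the prompts and the generation config to a single generate() call instead of staging requests with add_request() first. Below is a minimal sketch of the merged call pattern; StubEngine is a hypothetical stand-in for ColossalAI's InferenceEngine (only the add_request/generate names and keyword arguments are taken from the diff), and data/generation_config mirror the benchmark's variables:

# Minimal sketch of the merged API shape. StubEngine is a hypothetical
# stand-in, used only to illustrate how the two-step add_request/generate
# flow collapses into one generate() call.
from dataclasses import dataclass, field
from typing import Any, List, Optional


@dataclass
class StubEngine:
    request_queue: List[List[int]] = field(default_factory=list)

    def add_request(self, prompts_token_ids: List[List[int]]) -> None:
        # Old flow, step 1: stage prompts for a later generate() call.
        self.request_queue.extend(prompts_token_ids)

    def generate(
        self,
        prompts_token_ids: Optional[List[List[int]]] = None,
        generation_config: Any = None,
    ) -> List[List[int]]:
        # New flow: prompts passed directly are enqueued here, so callers
        # no longer need a separate add_request() step.
        if prompts_token_ids is not None:
            self.add_request(prompts_token_ids=prompts_token_ids)
        # Drain the queue and pretend to decode one token per prompt.
        outputs = [prompt + [0] for prompt in self.request_queue]
        self.request_queue.clear()
        return outputs


engine = StubEngine()
data = [[1, 2, 3], [4, 5]]  # toy prompt token ids
generation_config = None    # placeholder for a real generation config

# Single call, as in the updated benchmark:
print(engine.generate(prompts_token_ids=data, generation_config=generation_config))

Folding the enqueue step into generate() keeps the benchmark's per-batch loop to one engine call, which is what the "3 changed files with 21 additions and 16 deletions" summary above reflects.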