[example] add vllm inference benchmark (#5080)

* [example] add vllm inference benchmark

* [example] update vllm inference benchmark
Hongxin Liu 2023-11-21 13:28:13 +08:00 committed by FrankLeeeee
parent dce05da535
commit 42b2d6f3a5


@@ -0,0 +1,89 @@
"""Benchmark the latency of processing a single batch of requests."""
"""Modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_latency.py
The only change is printing the throughput.
"""
import argparse
import time
import numpy as np
import torch
from tqdm import tqdm
from vllm import LLM, SamplingParams


def main(args: argparse.Namespace):
    # Process all the requests in a single batch if possible.
    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
    llm = LLM(
        model=args.model,
        tokenizer=args.tokenizer,
        quantization=args.quantization,
        tensor_parallel_size=args.tensor_parallel_size,
        max_num_seqs=args.batch_size,
        max_num_batched_tokens=args.batch_size * args.input_len,
        trust_remote_code=args.trust_remote_code,
        dtype=args.dtype,
    )
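
    # With beam search enabled the temperature is forced to 0.0; otherwise plain sampling
    # at temperature 1.0 is used. ignore_eos=True makes every sequence generate exactly
    # args.output_len tokens, keeping the workload shape fixed across runs.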
    sampling_params = SamplingParams(
        n=args.n,
        temperature=0.0 if args.use_beam_search else 1.0,
        top_p=1.0,
        use_beam_search=args.use_beam_search,
        ignore_eos=True,
        max_tokens=args.output_len,
    )
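
    # Dummy prompts: args.batch_size sequences of args.input_len zero token IDs.
    # Latency depends only on the sequence shape, not on the prompt content.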
    dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size

    def run_to_completion(profile: bool = False):
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()

        llm.generate(prompt_token_ids=dummy_prompt_token_ids, sampling_params=sampling_params, use_tqdm=False)

        end_time = time.perf_counter()
        latency = end_time - start_time
        if profile:
            torch.cuda.cudart().cudaProfilerStop()
        return latency
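
    # Warm up once (untimed) so one-time initialization costs such as kernel compilation
    # and memory allocation do not skew the measured latencies.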
print("Warming up...")
run_to_completion(profile=False)
# Benchmark.
latencies = []
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
latencies.append(run_to_completion(profile=False))
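
    # Each iteration generates args.batch_size * args.output_len tokens in total
    # (ignore_eos=True guarantees the full output length per sequence), so the
    # throughput is the generated token count divided by the mean latency.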
print(f"Avg latency: {np.mean(latencies)} seconds")
print(f"Avg throughput: {args.batch_size*args.output_len/np.mean(latencies)} tokens/s")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Benchmark the latency of processing a single batch of " "requests till completion."
)
parser.add_argument("--model", type=str, default="facebook/opt-125m")
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument("--quantization", "-q", choices=["awq", "squeezellm", None], default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--input-len", type=int, default=32)
parser.add_argument("--output-len", type=int, default=128)
parser.add_argument("--batch-size", type=int, default=8)
parser.add_argument("--n", type=int, default=1, help="Number of generated sequences per prompt.")
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument("--num-iters", type=int, default=3, help="Number of iterations to run.")
parser.add_argument("--trust-remote-code", action="store_true", help="trust remote code from huggingface")
parser.add_argument(
"--dtype",
type=str,
default="auto",
choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
help="data type for model weights and activations. "
'The "auto" option will use FP16 precision '
"for FP32 and FP16 models, and BF16 precision "
"for BF16 models.",
)
args = parser.parse_args()
main(args)