[example] add vllm inference benchmark (#5080)

* [example] add vllm inference benchmark

* [example] update vllm inference benchmark
Hongxin Liu 2023-11-21 13:28:13 +08:00 committed by FrankLeeeee
parent dce05da535
commit 42b2d6f3a5


@@ -0,0 +1,89 @@
"""Benchmark the latency of processing a single batch of requests."""
"""Modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_latency.py
The only change is printing the throughput.
"""
import argparse
import time
import numpy as np
import torch
from tqdm import tqdm
from vllm import LLM, SamplingParams


def main(args: argparse.Namespace):
    # Process all the requests in a single batch if possible.
    # NOTE(woosuk): If the request cannot be processed in a single batch,
    # the engine will automatically process the request in multiple batches.
    llm = LLM(
        model=args.model,
        tokenizer=args.tokenizer,
        quantization=args.quantization,
        tensor_parallel_size=args.tensor_parallel_size,
        max_num_seqs=args.batch_size,
        max_num_batched_tokens=args.batch_size * args.input_len,
        trust_remote_code=args.trust_remote_code,
        dtype=args.dtype,
    )
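
    # With beam search enabled the temperature is forced to 0.0; otherwise plain sampling
    # at temperature 1.0 is used. ignore_eos=True makes every sequence generate exactly
    # args.output_len tokens, keeping the workload shape fixed across runs.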
    sampling_params = SamplingParams(
        n=args.n,
        temperature=0.0 if args.use_beam_search else 1.0,
        top_p=1.0,
        use_beam_search=args.use_beam_search,
        ignore_eos=True,
        max_tokens=args.output_len,
    )
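
    # Dummy prompts: args.batch_size sequences of args.input_len zero token IDs.
    # Latency depends only on the sequence shape, not on the prompt content.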
    dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size

    def run_to_completion(profile: bool = False):
        if profile:
            torch.cuda.cudart().cudaProfilerStart()
        start_time = time.perf_counter()

        llm.generate(prompt_token_ids=dummy_prompt_token_ids, sampling_params=sampling_params, use_tqdm=False)

        end_time = time.perf_counter()
        latency = end_time - start_time
        if profile:
            torch.cuda.cudart().cudaProfilerStop()
        return latency
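
    # Warm up once (untimed) so one-time initialization costs such as kernel compilation
    # and memory allocation do not skew the measured latencies.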
print("Warming up...")
run_to_completion(profile=False)
# Benchmark.
latencies = []
for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
latencies.append(run_to_completion(profile=False))
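
    # Each iteration generates args.batch_size * args.output_len tokens in total
    # (ignore_eos=True guarantees the full output length per sequence), so the
    # throughput is the generated token count divided by the mean latency.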
print(f"Avg latency: {np.mean(latencies)} seconds")
print(f"Avg throughput: {args.batch_size*args.output_len/np.mean(latencies)} tokens/s")
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Benchmark the latency of processing a single batch of " "requests till completion."
)
parser.add_argument("--model", type=str, default="facebook/opt-125m")
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument("--quantization", "-q", choices=["awq", "squeezellm", None], default=None)
parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
parser.add_argument("--input-len", type=int, default=32)
parser.add_argument("--output-len", type=int, default=128)
parser.add_argument("--batch-size", type=int, default=8)
parser.add_argument("--n", type=int, default=1, help="Number of generated sequences per prompt.")
parser.add_argument("--use-beam-search", action="store_true")
parser.add_argument("--num-iters", type=int, default=3, help="Number of iterations to run.")
parser.add_argument("--trust-remote-code", action="store_true", help="trust remote code from huggingface")
parser.add_argument(
"--dtype",
type=str,
default="auto",
choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
help="data type for model weights and activations. "
'The "auto" option will use FP16 precision '
"for FP32 and FP16 models, and BF16 precision "
"for BF16 models.",
)
args = parser.parse_args()
main(args)