diff --git a/examples/inference/benchmark_vllm.py b/examples/inference/benchmark_vllm.py
new file mode 100644
index 000000000..ae367afd5
--- /dev/null
+++ b/examples/inference/benchmark_vllm.py
@@ -0,0 +1,89 @@
+"""Benchmark the latency of processing a single batch of requests."""
+"""Modified from https://github.com/vllm-project/vllm/blob/main/benchmarks/benchmark_latency.py
+The only change is printing the throughput.
+"""
+import argparse
+import time
+
+import numpy as np
+import torch
+from tqdm import tqdm
+from vllm import LLM, SamplingParams
+
+
+def main(args: argparse.Namespace):
+    # Process all the requests in a single batch if possible.
+    # NOTE(woosuk): If the request cannot be processed in a single batch,
+    # the engine will automatically process the request in multiple batches.
+    llm = LLM(
+        model=args.model,
+        tokenizer=args.tokenizer,
+        quantization=args.quantization,
+        tensor_parallel_size=args.tensor_parallel_size,
+        max_num_seqs=args.batch_size,
+        max_num_batched_tokens=args.batch_size * args.input_len,
+        trust_remote_code=args.trust_remote_code,
+        dtype=args.dtype,
+    )
+
+    sampling_params = SamplingParams(
+        n=args.n,
+        temperature=0.0 if args.use_beam_search else 1.0,
+        top_p=1.0,
+        use_beam_search=args.use_beam_search,
+        ignore_eos=True,
+        max_tokens=args.output_len,
+    )
+    dummy_prompt_token_ids = [[0] * args.input_len] * args.batch_size
+
+    def run_to_completion(profile: bool = False):
+        if profile:
+            torch.cuda.cudart().cudaProfilerStart()
+        start_time = time.perf_counter()
+
+        llm.generate(prompt_token_ids=dummy_prompt_token_ids, sampling_params=sampling_params, use_tqdm=False)
+
+        end_time = time.perf_counter()
+        latency = end_time - start_time
+        if profile:
+            torch.cuda.cudart().cudaProfilerStop()
+        return latency
+
+    print("Warming up...")
+    run_to_completion(profile=False)
+
+    # Benchmark.
+    latencies = []
+    for _ in tqdm(range(args.num_iters), desc="Profiling iterations"):
+        latencies.append(run_to_completion(profile=False))
+    print(f"Avg latency: {np.mean(latencies)} seconds")
+    print(f"Avg throughput: {args.batch_size*args.output_len/np.mean(latencies)} tokens/s")
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="Benchmark the latency of processing a single batch of " "requests till completion."
+    )
+    parser.add_argument("--model", type=str, default="facebook/opt-125m")
+    parser.add_argument("--tokenizer", type=str, default=None)
+    parser.add_argument("--quantization", "-q", choices=["awq", "squeezellm", None], default=None)
+    parser.add_argument("--tensor-parallel-size", "-tp", type=int, default=1)
+    parser.add_argument("--input-len", type=int, default=32)
+    parser.add_argument("--output-len", type=int, default=128)
+    parser.add_argument("--batch-size", type=int, default=8)
+    parser.add_argument("--n", type=int, default=1, help="Number of generated sequences per prompt.")
+    parser.add_argument("--use-beam-search", action="store_true")
+    parser.add_argument("--num-iters", type=int, default=3, help="Number of iterations to run.")
+    parser.add_argument("--trust-remote-code", action="store_true", help="trust remote code from huggingface")
+    parser.add_argument(
+        "--dtype",
+        type=str,
+        default="auto",
+        choices=["auto", "half", "float16", "bfloat16", "float", "float32"],
+        help="data type for model weights and activations. "
+        'The "auto" option will use FP16 precision '
+        "for FP32 and FP16 models, and BF16 precision "
+        "for BF16 models.",
+    )
+    args = parser.parse_args()
+    main(args)
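
A minimal sketch, separate from the patch above: it only restates the throughput arithmetic the script prints (the one change relative to vLLM's upstream benchmark_latency.py). The latency values below are hypothetical placeholders, not measured results.

    # Throughput as printed by the script: generated tokens per second,
    # i.e. batch_size * output_len divided by the mean per-batch latency.
    import numpy as np

    batch_size, output_len = 8, 128     # defaults of the --batch-size / --output-len flags above
    latencies = [2.10, 2.05, 2.08]      # hypothetical per-iteration latencies, in seconds
    avg_latency = np.mean(latencies)
    throughput = batch_size * output_len / avg_latency
    print(f"Avg latency: {avg_latency:.3f} seconds")
    print(f"Avg throughput: {throughput:.1f} tokens/s")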