refactor: The first refactored version for sdk release (#907)

Co-authored-by: chengfangyin2 <chengfangyin3@jd.com>
Author: FangYin Cheng
Date: 2023-12-08 14:45:59 +08:00
Committed by: GitHub
Parent: e7e4aff667
Commit: cd725db1fb
573 changed files with 2094 additions and 3571 deletions

View File

@@ -0,0 +1,296 @@
"""
Adapted from fastchat: https://github.com/lm-sys/FastChat/blob/main/fastchat/serve/inference.py.
For benchmarks.
"""
import gc
from typing import Iterable, Dict
import torch
from transformers.generation.logits_process import (
LogitsProcessorList,
RepetitionPenaltyLogitsProcessor,
TemperatureLogitsWarper,
TopKLogitsWarper,
TopPLogitsWarper,
)
from fastchat.utils import is_partial_stop, is_sentence_complete, get_context_length
def prepare_logits_processor(
temperature: float, repetition_penalty: float, top_p: float, top_k: int
) -> LogitsProcessorList:
processor_list = LogitsProcessorList()
# TemperatureLogitsWarper doesn't accept 0.0; 1.0 makes it a no-op, so we skip both cases.
if temperature >= 1e-5 and temperature != 1.0:
processor_list.append(TemperatureLogitsWarper(temperature))
if repetition_penalty > 1.0:
processor_list.append(RepetitionPenaltyLogitsProcessor(repetition_penalty))
if 1e-8 <= top_p < 1.0:
processor_list.append(TopPLogitsWarper(top_p))
if top_k > 0:
processor_list.append(TopKLogitsWarper(top_k))
return processor_list
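# Illustrative note (not in the original commit): with temperature=0.7,
# repetition_penalty=1.1, top_p=0.9 and top_k=40, the returned list applies,
# in order, temperature scaling, the repetition penalty, top-p filtering and
# top-k filtering; near-greedy settings (temperature < 1e-5) add no temperature warper.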
@torch.inference_mode()
def generate_stream(
model,
tokenizer,
params: Dict,
device: str,
context_len: int,
stream_interval: int = 1,
judge_sent_end: bool = False,
):
if hasattr(model, "device"):
device = model.device
# Read parameters
prompt = params["prompt"]
len_prompt = len(prompt)
temperature = float(params.get("temperature", 1.0))
repetition_penalty = float(params.get("repetition_penalty", 1.0))
top_p = float(params.get("top_p", 1.0))
top_k = int(params.get("top_k", -1)) # -1 means disable
max_new_tokens = int(params.get("max_new_tokens", 256))
logprobs = params.get("logprobs", None) # FIXME: Support logprobs>1.
echo = bool(params.get("echo", True))
stop_str = params.get("stop", None)
stop_token_ids = params.get("stop_token_ids", None) or []
if tokenizer.eos_token_id not in stop_token_ids:
stop_token_ids.append(tokenizer.eos_token_id)
logits_processor = prepare_logits_processor(
temperature, repetition_penalty, top_p, top_k
)
input_ids = tokenizer(prompt).input_ids
if model.config.is_encoder_decoder:
max_src_len = context_len
else: # truncate
max_src_len = context_len - max_new_tokens - 1
input_ids = input_ids[-max_src_len:]
output_ids = list(input_ids)
input_echo_len = len(input_ids)
# Don't stop generating until max_new_tokens is reached.
stop_token_ids = []
stop_str = None
if model.config.is_encoder_decoder:
if logprobs is not None: # FIXME: Support logprobs for encoder-decoder models.
raise NotImplementedError
encoder_output = model.encoder(
input_ids=torch.as_tensor([input_ids], device=device)
)[0]
start_ids = torch.as_tensor(
[[model.generation_config.decoder_start_token_id]],
dtype=torch.int64,
device=device,
)
else:
start_ids = torch.as_tensor([input_ids], device=device)
past_key_values = out = None
token_logprobs = [None] # The first token has no logprobs.
sent_interrupt = False
finish_reason = None
for i in range(max_new_tokens):
if i == 0: # prefill
if model.config.is_encoder_decoder:
out = model.decoder(
input_ids=start_ids,
encoder_hidden_states=encoder_output,
use_cache=True,
)
logits = model.lm_head(out[0])
else:
out = model(input_ids=start_ids, use_cache=True)
logits = out.logits
past_key_values = out.past_key_values
if logprobs is not None:
# Prefill logprobs for the prompt.
shift_input_ids = start_ids[..., 1:].contiguous()
shift_logits = logits[..., :-1, :].contiguous()
shift_logits = torch.log_softmax(shift_logits, dim=-1).tolist()
for label_id, logit in zip(
shift_input_ids[0].tolist(), shift_logits[0]
):
token_logprobs.append(logit[label_id])
else: # decoding
if model.config.is_encoder_decoder:
out = model.decoder(
input_ids=torch.as_tensor(
[[token] if not sent_interrupt else output_ids],
device=device,
),
encoder_hidden_states=encoder_output,
use_cache=True,
past_key_values=past_key_values if not sent_interrupt else None,
)
sent_interrupt = False
logits = model.lm_head(out[0])
else:
out = model(
input_ids=torch.as_tensor(
[[token] if not sent_interrupt else output_ids],
device=device,
),
use_cache=True,
past_key_values=past_key_values if not sent_interrupt else None,
)
sent_interrupt = False
logits = out.logits
past_key_values = out.past_key_values
if logits_processor:
if repetition_penalty > 1.0:
tmp_output_ids = torch.as_tensor([output_ids], device=logits.device)
else:
tmp_output_ids = None
last_token_logits = logits_processor(tmp_output_ids, logits[:, -1, :])[0]
else:
last_token_logits = logits[0, -1, :]
if device == "mps":
# Switch to CPU to avoid some bugs in the mps backend.
last_token_logits = last_token_logits.float().to("cpu")
if temperature < 1e-5 or top_p < 1e-8: # greedy
_, indices = torch.topk(last_token_logits, 2)
tokens = [int(index) for index in indices.tolist()]
else:
probs = torch.softmax(last_token_logits, dim=-1)
indices = torch.multinomial(probs, num_samples=2)
tokens = [int(token) for token in indices.tolist()]
token = tokens[0]
output_ids.append(token)
if logprobs is not None:
# Cannot use last_token_logits because logprobs is based on raw logits.
token_logprobs.append(
torch.log_softmax(logits[0, -1, :], dim=-1)[token].tolist()
)
if token in stop_token_ids:
stopped = True
else:
stopped = False
# Yield the output tokens
if i % stream_interval == 0 or i == max_new_tokens - 1 or stopped:
if echo:
tmp_output_ids = output_ids
rfind_start = len_prompt
else:
tmp_output_ids = output_ids[input_echo_len:]
rfind_start = 0
output = tokenizer.decode(
tmp_output_ids,
skip_special_tokens=True,
spaces_between_special_tokens=False,
clean_up_tokenization_spaces=True,
)
ret_logprobs = None
if logprobs is not None:
ret_logprobs = {
"text_offset": [],
"tokens": [
tokenizer.decode(token)
for token in (
output_ids if echo else output_ids[input_echo_len:]
)
],
"token_logprobs": token_logprobs
if echo
else token_logprobs[input_echo_len:],
"top_logprobs": [{}]
* len(token_logprobs if echo else token_logprobs[input_echo_len:]),
}
# Compute text_offset
curr_pos = 0
for text in ret_logprobs["tokens"]:
ret_logprobs["text_offset"].append(curr_pos)
curr_pos += len(text)
# TODO: Patch for incomplete sentences interrupting the output; this can be reworked in a more elegant way.
if judge_sent_end and stopped and not is_sentence_complete(output):
if len(tokens) > 1:
token = tokens[1]
output_ids[-1] = token
else:
output_ids.pop()
stopped = False
sent_interrupt = True
partially_stopped = False
if stop_str:
if isinstance(stop_str, str):
pos = output.rfind(stop_str, rfind_start)
if pos != -1:
output = output[:pos]
stopped = True
else:
partially_stopped = is_partial_stop(output, stop_str)
elif isinstance(stop_str, Iterable):
for each_stop in stop_str:
pos = output.rfind(each_stop, rfind_start)
if pos != -1:
output = output[:pos]
stopped = True
break
else:
partially_stopped = is_partial_stop(output, each_stop)
if partially_stopped:
break
else:
raise ValueError("Invalid stop field type.")
# Prevent yielding partial stop sequence
if not partially_stopped:
yield {
"text": output,
"logprobs": ret_logprobs,
"usage": {
"prompt_tokens": input_echo_len,
"completion_tokens": i,
"total_tokens": input_echo_len + i,
},
"finish_reason": None,
}
if stopped:
break
# Final stream event, which contains the finish reason
else:
finish_reason = "length"
if stopped:
finish_reason = "stop"
yield {
"text": output,
"logprobs": ret_logprobs,
"usage": {
"prompt_tokens": input_echo_len,
"completion_tokens": i,
"total_tokens": input_echo_len + i,
},
"finish_reason": finish_reason,
}
# Clean up
del past_key_values, out
gc.collect()
torch.cuda.empty_cache()
if device == "xpu":
torch.xpu.empty_cache()
if device == "npu":
torch.npu.empty_cache()
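As a minimal sketch (not part of this commit), generate_stream might be driven as follows; the checkpoint, prompt, and sampling values below are illustrative assumptions:

# Hypothetical usage sketch for generate_stream; model path and prompt are assumptions.
from transformers import AutoModelForCausalLM, AutoTokenizer
from fastchat.utils import get_context_length

model_path = "lmsys/vicuna-7b-v1.5"  # illustrative checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForCausalLM.from_pretrained(model_path).to("cuda")

params = {
    "prompt": "Hello, who are you?",
    "temperature": 0.7,
    "top_p": 0.9,
    "max_new_tokens": 64,
    "echo": False,
}
last_chunk = None
for chunk in generate_stream(
    model,
    tokenizer,
    params,
    device="cuda",
    context_len=get_context_length(model.config),
):
    last_chunk = chunk  # each chunk carries the text so far plus usage stats
print(last_chunk["text"], last_chunk["finish_reason"])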

View File

@@ -0,0 +1,282 @@
from typing import Dict, List
import asyncio
import os
import sys
import time
import csv
import argparse
import logging
import traceback
from dbgpt.configs.model_config import ROOT_PATH, LLM_MODEL_CONFIG
from datetime import datetime
from dbgpt.model.cluster.worker.manager import (
run_worker_manager,
initialize_worker_manager_in_client,
WorkerManager,
)
from dbgpt.core import ModelOutput, ModelInferenceMetrics
from dbgpt.core.interface.message import ModelMessage, ModelMessageRoleType
model_name = "vicuna-7b-v1.5"
model_path = LLM_MODEL_CONFIG[model_name]
# or vllm
model_type = "huggingface"
controller_addr = "http://127.0.0.1:5000"
result_csv_file = None
parallel_nums = [1, 2, 4, 16, 32]
# parallel_nums = [1, 2, 4]
def get_result_csv_file() -> str:
return os.path.join(
ROOT_PATH, f"pilot/data/{model_name}_{model_type}_benchmarks_llm.csv"
)
input_lens = [64, 64]
output_lens = [256, 512]
prompt_file_map = {
"11k": os.path.join(
ROOT_PATH, "docker/examples/benchmarks/benchmarks_llm_11k_prompt.txt"
)
}
METRICS_HEADERS = [
# Params
"model_name",
"gpu_nums",
"parallel_nums",
"input_length",
"output_length",
# Merged results across parallel requests
"test_time_cost_ms",
"test_total_tokens",
# avg_test_speed_per_second: (tokens / s), test_total_tokens / (test_time_cost_ms / 1000.0)
"avg_test_speed_per_second(tokens/s)",
# avg_first_token_latency_ms: sum(first_token_time_ms) / parallel_nums
"avg_first_token_latency_ms",
# avg_latency_ms: sum(end_time_ms - start_time_ms) / parallel_nums
"avg_latency_ms",
"gpu_mem(GiB)",
# Details for each task
"start_time_ms",
"end_time_ms",
"current_time_ms",
"first_token_time_ms",
"first_completion_time_ms",
"first_completion_tokens",
"prompt_tokens",
"completion_tokens",
"total_tokens",
"speed_per_second",
]
def read_prompt_from_file(file_key: str) -> str:
full_path = prompt_file_map[file_key]
with open(full_path, "r+", encoding="utf-8") as f:
return f.read()
def build_param(
input_len: int,
output_len: int,
user_input: str,
system_prompt: str = None,
) -> Dict:
hist = []
if system_prompt is not None:
hist.append(
ModelMessage(role=ModelMessageRoleType.SYSTEM, content=system_prompt)
)
hist.append(ModelMessage(role=ModelMessageRoleType.HUMAN, content=user_input))
hist = list(h.dict() for h in hist)
context_len = input_len + output_len + 2
params = {
"prompt": user_input,
"messages": hist,
"model": model_name,
"echo": False,
"max_new_tokens": output_len,
"context_len": context_len,
}
return params
async def run_batch(
wh: WorkerManager,
input_len: int,
output_len: int,
parallel_num: int,
output_file: str,
):
tasks = []
prompt = read_prompt_from_file("11k")
if model_type == "vllm":
max_input_str_len = input_len
if "baichuan" in model_name:
# TODO: handle the prompt first
max_input_str_len *= 2
prompt = prompt[-max_input_str_len:]
# Warmup first
params = build_param(input_len, output_len, prompt, system_prompt="")
await wh.generate(params)
for _ in range(parallel_num):
params = build_param(input_len, output_len, prompt, system_prompt="")
tasks.append(wh.generate(params))
print(
f"Begin run benchmarks, model name: {model_name}, input_len: {input_len}, output_len: {output_len}, parallel_num: {parallel_num}, save result to {output_file}"
)
start_time_ms = time.time_ns() // 1_000_000
results: List[ModelOutput] = await asyncio.gather(*tasks)
end_time_ms = time.time_ns() // 1_000_000
test_time_cost_ms = end_time_ms - start_time_ms
test_total_tokens = 0
first_token_latency_ms = 0
latency_ms = 0
gpu_nums = 0
avg_gpu_mem = 0
rows = []
for r in results:
metrics = r.metrics
if isinstance(metrics, dict):
metrics = ModelInferenceMetrics(**metrics)
print(r)
test_total_tokens += metrics.total_tokens
first_token_latency_ms += metrics.first_token_time_ms - metrics.start_time_ms
latency_ms += metrics.end_time_ms - metrics.start_time_ms
row_data = metrics.to_dict()
del row_data["collect_index"]
if "avg_gpu_infos" in row_data:
avg_gpu_infos = row_data["avg_gpu_infos"]
gpu_nums = len(avg_gpu_infos)
avg_gpu_mem = (
sum(i["allocated_memory_gb"] for i in avg_gpu_infos) / gpu_nums
)
del row_data["avg_gpu_infos"]
del row_data["current_gpu_infos"]
rows.append(row_data)
avg_test_speed_per_second = test_total_tokens / (test_time_cost_ms / 1000.0)
avg_first_token_latency_ms = first_token_latency_ms / len(results)
avg_latency_ms = latency_ms / len(results)
with open(output_file, "a", newline="", encoding="utf-8") as f:
writer = csv.DictWriter(f, fieldnames=METRICS_HEADERS)
if f.tell() == 0:
# First time writing to this file; write the header
writer.writeheader()
for row in rows:
row["model_name"] = model_name
row["parallel_nums"] = parallel_num
row["input_length"] = input_len
row["output_length"] = output_len
row["test_time_cost_ms"] = test_time_cost_ms
row["test_total_tokens"] = test_total_tokens
row["avg_test_speed_per_second(tokens/s)"] = avg_test_speed_per_second
row["avg_first_token_latency_ms"] = avg_first_token_latency_ms
row["avg_latency_ms"] = avg_latency_ms
row["gpu_nums"] = gpu_nums
row["gpu_mem(GiB)"] = avg_gpu_mem
writer.writerow(row)
print(
f"input_len: {input_len}, output_len: {output_len}, parallel_num: {parallel_num}, save result to {output_file}"
)
async def run_model(wh: WorkerManager) -> None:
global result_csv_file
if not result_csv_file:
result_csv_file = get_result_csv_file()
if os.path.exists(result_csv_file):
now = datetime.now()
now_str = now.strftime("%Y-%m-%d")
os.rename(result_csv_file, f"{result_csv_file}.bak_{now_str}.csv")
for parallel_num in parallel_nums:
for input_len, output_len in zip(input_lens, output_lens):
try:
await run_batch(
wh, input_len, output_len, parallel_num, result_csv_file
)
except Exception:
msg = traceback.format_exc()
logging.error(
f"Run benchmarks error, input_len: {input_len}, output_len: {output_len}, parallel_num: {parallel_num}, error message: {msg}"
)
if "torch.cuda.OutOfMemoryError" in msg:
return
sys.exit(0)
def startup_llm_env():
from fastapi import FastAPI
app = FastAPI()
initialize_worker_manager_in_client(
app=app,
model_name=model_name,
model_path=model_path,
run_locally=False,
controller_addr=controller_addr,
local_port=6000,
start_listener=run_model,
)
def connect_to_remote_model():
startup_llm_env()
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model_name", type=str, default=model_name)
parser.add_argument("--model_path", type=str, default=None)
parser.add_argument("--model_type", type=str, default="huggingface")
parser.add_argument("--result_csv_file", type=str, default=None)
parser.add_argument("--input_lens", type=str, default="8,8,256,1024")
parser.add_argument("--output_lens", type=str, default="256,512,1024,1024")
parser.add_argument("--parallel_nums", type=str, default="1,2,4,16,32")
parser.add_argument(
"--remote_model", type=bool, default=False, help="Connect to remote model"
)
parser.add_argument("--controller_addr", type=str, default="http://127.0.0.1:8000")
parser.add_argument("--limit_model_concurrency", type=int, default=200)
args = parser.parse_args()
print(f"args: {args}")
model_name = args.model_name
model_path = args.model_path or LLM_MODEL_CONFIG[model_name]
result_csv_file = args.result_csv_file
input_lens = [int(i) for i in args.input_lens.strip().split(",")]
output_lens = [int(i) for i in args.output_lens.strip().split(",")]
parallel_nums = [int(i) for i in args.parallel_nums.strip().split(",")]
remote_model = args.remote_model
controller_addr = args.controller_addr
limit_model_concurrency = args.limit_model_concurrency
model_type = args.model_type
if len(input_lens) != len(output_lens):
raise ValueError("input_lens size must equal output_lens size")
if remote_model:
# Connect to remote model and run benchmarks
connect_to_remote_model()
else:
# Start worker manager and run benchmarks
run_worker_manager(
model_name=model_name,
model_path=model_path,
start_listener=run_model,
limit_model_concurrency=limit_model_concurrency,
model_type=model_type,
)
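A minimal usage sketch for this benchmark script (the file's path is not shown in this diff, so the name below is an assumption; the flags match the argparse definitions above):

# Hypothetical invocation of the benchmark script via subprocess;
# "benchmarks_llm.py" is an assumed file name.
import subprocess

subprocess.run(
    [
        "python",
        "benchmarks_llm.py",
        "--model_name", "vicuna-7b-v1.5",
        "--input_lens", "64,64",
        "--output_lens", "256,512",
        "--parallel_nums", "1,2,4",
    ],
    check=True,
)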