Mirror of https://github.com/csunny/DB-GPT.git, synced 2025-07-23 12:21:08 +00:00

feat(model): Support MLX inference (#2781)

parent 9084c6c19c
commit d9d4d4b6bc

configs/dbgpt-local-mlx.toml (new file, 36 lines)
@@ -0,0 +1,36 @@
[system]
# Load language from environment variable(It is set by the hook)
language = "${env:DBGPT_LANG:-zh}"
api_keys = []
encrypt_key = "your_secret_key"

# Server Configurations
[service.web]
host = "0.0.0.0"
port = 5670

[service.web.database]
type = "sqlite"
path = "pilot/meta_data/dbgpt.db"

[rag.storage]
[rag.storage.vector]
type = "chroma"
persist_path = "pilot/data"

# Model Configurations
[models]
[[models.llms]]
name = "Qwen/Qwen3-0.6B-MLX-4bit"
provider = "mlx"
# If not provided, the model will be downloaded from the Hugging Face model hub
# uncomment the following line to specify the model path in the local file system
# https://huggingface.co/Qwen/Qwen3-0.6B-MLX-4bit
# path = "the-model-path-in-the-local-file-system"

[[models.embeddings]]
name = "BAAI/bge-large-zh-v1.5"
provider = "hf"
# If not provided, the model will be downloaded from the Hugging Face model hub
# uncomment the following line to specify the model path in the local file system
# path = "the-model-path-in-the-local-file-system"
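To try this config directly, the launch command documented in the new MLX inference doc below can point at it; a minimal sketch, assuming you run it from the repository root so the relative paths resolve against DBGPT_ROOT_PATH:

```bash
# Start the DB-GPT webserver with the shipped MLX example config.
# The path is relative, so load_config resolves it against the project root.
uv run dbgpt start webserver --config configs/dbgpt-local-mlx.toml
```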
docs/docs/installation/advanced_usage/mlx_inference.md (new file, 43 lines)
@@ -0,0 +1,43 @@
# MLX Inference

DB-GPT supports [MLX](https://github.com/ml-explore/mlx-lm) inference, a fast and easy-to-use LLM inference and service library.

## Install dependencies

`MLX` is an optional dependency in DB-GPT. You can install it by adding the extra `--extra "mlx"` when installing dependencies.

```bash
# Use uv to install dependencies needed for mlx
# Install core dependencies and select desired extensions
uv sync --all-packages \
    --extra "base" \
    --extra "hf" \
    --extra "mlx" \
    --extra "rag" \
    --extra "storage_chromadb" \
    --extra "quant_bnb" \
    --extra "dbgpts"
```

## Modify configuration file

After installing the dependencies, you can modify your configuration file to use the `mlx` provider.

```toml
# Model Configurations
[models]
[[models.llms]]
name = "Qwen/Qwen3-0.6B-MLX-4bit"
provider = "mlx"
# If not provided, the model will be downloaded from the Hugging Face model hub
# uncomment the following line to specify the model path in the local file system
# https://huggingface.co/Qwen/Qwen3-0.6B-MLX-4bit
# path = "the-model-path-in-the-local-file-system"
```

## Run the Model

You can run the model using the following command:

```bash
uv run dbgpt start webserver --config {your_config_file}
```
@@ -170,6 +170,10 @@ const sidebars = {
       {
         type: 'doc',
         id: 'installation/advanced_usage/vLLM_inference',
       },
+      {
+        type: 'doc',
+        id: 'installation/advanced_usage/mlx_inference',
+      },
       {
         type: 'doc',
         id: 'installation/advanced_usage/Llamacpp_server',
docs/yarn.lock (542 lines changed; diff suppressed because it is too large)
@@ -68,6 +68,9 @@ vllm = [
     # Just support GPU version on Linux
     "vllm>=0.7.0; sys_platform == 'linux'",
 ]
+mlx = [
+    "mlx-lm>=0.25.2; sys_platform == 'darwin'",
+]
 # vllm_pascal = [
 #     # https://github.com/sasha0552/pascal-pkgs-ci
 #     "vllm-pascal==0.7.2; sys_platform == 'linux'"
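The `mlx` extra only resolves on macOS (`sys_platform == 'darwin'`). If you manage dependencies outside of `uv sync`, a minimal sketch of installing the runtime directly, mirroring the version pin above (the adapter below suggests the same when the import fails):

```bash
# Install the MLX LLM runtime directly; only meaningful on Apple Silicon macOS.
pip install "mlx-lm>=0.25.2"
```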
@@ -279,7 +279,9 @@ def load_config(config_file: str = None) -> ApplicationConfig:
     from dbgpt.configs.model_config import ROOT_PATH as DBGPT_ROOT_PATH

     if config_file is None:
-        config_file = os.path.join(DBGPT_ROOT_PATH, "configs", "dbgpt-siliconflow.toml")
+        config_file = os.path.join(
+            DBGPT_ROOT_PATH, "configs", "dbgpt-proxy-siliconflow.toml"
+        )
     elif not os.path.isabs(config_file):
         # If config_file is a relative path, make it relative to DBGPT_ROOT_PATH
         config_file = os.path.join(DBGPT_ROOT_PATH, config_file)
@@ -37,6 +37,7 @@ def scan_model_providers():
         base_class=LLMDeployModelParameters,
         specific_files=[
             "vllm_adapter",
+            "mlx_adapter",
             "hf_adapter",
             "llama_cpp_adapter",
             "llama_cpp_py_adapter",
@@ -270,14 +270,18 @@ class LLMModelAdapter(ABC):
         ):
             return True
         return (
-            lower_model_name_or_path
-            and "deepseek" in lower_model_name_or_path
-            and (
-                "r1" in lower_model_name_or_path
-                or "reasoning" in lower_model_name_or_path
-                or "reasoner" in lower_model_name_or_path
+            (
+                lower_model_name_or_path
+                and "deepseek" in lower_model_name_or_path
+                and (
+                    "r1" in lower_model_name_or_path
+                    or "reasoning" in lower_model_name_or_path
+                    or "reasoner" in lower_model_name_or_path
+                )
             )
-        ) or (lower_model_name_or_path and "qwq" in lower_model_name_or_path)
+            or (lower_model_name_or_path and "qwq" in lower_model_name_or_path)
+            or (lower_model_name_or_path and "qwen3" in lower_model_name_or_path)
+        )

     def support_async(self) -> bool:
         """Whether the loaded model supports asynchronous calls"""
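For reference, a standalone sketch of the name-based fallback this hunk produces (the function name and example model IDs are illustrative, not part of the adapter API):

```python
def _looks_like_reasoning_model(name: str) -> bool:
    # Mirrors the updated check: DeepSeek R1/reasoner variants, QwQ, and now Qwen3.
    n = name.lower()
    return (
        ("deepseek" in n and ("r1" in n or "reasoning" in n or "reasoner" in n))
        or "qwq" in n
        or "qwen3" in n
    )


assert _looks_like_reasoning_model("deepseek-ai/DeepSeek-R1")
assert _looks_like_reasoning_model("Qwen/QwQ-32B")
assert _looks_like_reasoning_model("Qwen/Qwen3-0.6B-MLX-4bit")  # newly matched by this commit
```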
packages/dbgpt-core/src/dbgpt/model/adapter/mlx_adapter.py (new file, 126 lines)

@@ -0,0 +1,126 @@
import logging
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional, Type

from dbgpt.core import ModelMessage
from dbgpt.core.interface.parameter import LLMDeployModelParameters
from dbgpt.model.adapter.base import LLMModelAdapter, register_model_adapter
from dbgpt.model.adapter.model_metadata import COMMON_HF_MODELS
from dbgpt.model.adapter.template import ConversationAdapter
from dbgpt.model.base import ModelType
from dbgpt.util.i18n_utils import _

logger = logging.getLogger(__name__)


@dataclass
class MLXDeployModelParameters(LLMDeployModelParameters):
    """Local deploy model parameters."""

    provider: str = "mlx"

    path: Optional[str] = field(
        default=None,
        metadata={
            "order": -800,
            "help": _("The path of the model, if you want to deploy a local model."),
        },
    )
    device: Optional[str] = field(
        default="auto",
        metadata={
            "order": -700,
            "help": _(
                "Device to run model. If None, the device is automatically determined"
            ),
        },
    )

    concurrency: Optional[int] = field(
        default=100, metadata={"help": _("Model concurrency limit")}
    )

    @property
    def real_model_path(self) -> Optional[str]:
        """Get the real model path.

        If deploy model is not local, return None.
        """
        return self._resolve_root_path(self.path)

    @property
    def real_device(self) -> Optional[str]:
        """Get the real device."""
        return self.device or super().real_device


class MLXModelAdapter(LLMModelAdapter):
    def match(
        self,
        provider: str,
        model_name: Optional[str] = None,
        model_path: Optional[str] = None,
    ) -> bool:
        return provider == ModelType.MLX

    def model_type(self) -> str:
        return ModelType.MLX

    def model_param_class(
        self, model_type: str = None
    ) -> Type[MLXDeployModelParameters]:
        return MLXDeployModelParameters

    def get_default_conv_template(
        self, model_name: str, model_path: str
    ) -> Optional[ConversationAdapter]:
        return None

    def load_from_params(self, params: MLXDeployModelParameters):
        """Load model from parameters."""
        try:
            from mlx_lm import load
        except ImportError:
            logger.error(
                "MLX model adapter requires mlx_lm package. "
                "Please install it with `pip install mlx-lm`."
            )
            raise

        model_path = params.real_model_path
        model, tokenizer = load(model_path)
        return model, tokenizer

    def support_generate_function(self) -> bool:
        return False

    def get_generate_stream_function(
        self, model, deploy_model_params: LLMDeployModelParameters
    ):
        from dbgpt.model.llm.llm_out.mlx_llm import generate_stream

        return generate_stream

    def get_str_prompt(
        self,
        params: Dict,
        messages: List[ModelMessage],
        tokenizer: Any,
        prompt_template: str = None,
        convert_to_compatible_format: bool = False,
    ) -> Optional[str]:
        if not tokenizer:
            raise ValueError("tokenizer is None")
        if hasattr(tokenizer, "apply_chat_template"):
            messages = self.transform_model_messages(
                messages, convert_to_compatible_format
            )
            logger.debug(f"The messages after transform: \n{messages}")
            str_prompt = tokenizer.apply_chat_template(
                messages, tokenize=False, add_generation_prompt=True
            )
            return str_prompt
        return None


register_model_adapter(MLXModelAdapter, supported_models=COMMON_HF_MODELS)
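A hypothetical snippet showing how the adapter's pieces fit together outside the DB-GPT model worker; it assumes the base `LLMDeployModelParameters` accepts a `name` field (inside DB-GPT these objects are built from the TOML config instead):

```python
from dbgpt.model.adapter.mlx_adapter import MLXDeployModelParameters, MLXModelAdapter

params = MLXDeployModelParameters(
    name="Qwen/Qwen3-0.6B-MLX-4bit",  # downloaded from Hugging Face when no local path is set
    provider="mlx",
)
adapter = MLXModelAdapter()
model, tokenizer = adapter.load_from_params(params)  # wraps mlx_lm.load()

# The worker normally drives this generator; params must at least carry the prompt.
generate = adapter.get_generate_stream_function(model, params)
gen_params = {"prompt": "Hello, MLX!", "temperature": 0.7, "max_new_tokens": 64}
for output in generate(model, tokenizer, gen_params, device="mps", context_len=4096):
    print(output)
```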
@@ -17,6 +17,7 @@ class ModelType:
     LLAMA_CPP_SERVER = "llama.cpp.server"
     PROXY = "proxy"
     VLLM = "vllm"
+    MLX = "mlx"
     # TODO, support more model type
packages/dbgpt-core/src/dbgpt/model/llm/llm_out/mlx_llm.py (new file, 110 lines)

@@ -0,0 +1,110 @@
from typing import Dict

import mlx.nn as nn
from mlx_lm import stream_generate
from mlx_lm.sample_utils import make_sampler
from mlx_lm.tokenizer_utils import TokenizerWrapper

from dbgpt.core import ModelOutput

from ...utils.llm_metrics import LLMPerformanceMonitor
from ...utils.parse_utils import (
    _DEFAULT_THINK_END_TOKEN,
    _DEFAULT_THINK_START_TOKEN,
    parse_chat_message,
)


def generate_stream(
    model: nn.Module,
    tokenizer: TokenizerWrapper,
    params: Dict,
    device: str,
    context_len: int,
):
    prompt = params["prompt"]
    temperature = float(params.get("temperature", 0))
    top_p = float(params.get("top_p", 1.0))
    top_k = params.get("top_k", 0)
    max_new_tokens = int(params.get("max_new_tokens", 2048))
    # echo = bool(params.get("echo", True))
    think_start_token = params.get("think_start_token", _DEFAULT_THINK_START_TOKEN)
    think_end_token = params.get("think_end_token", _DEFAULT_THINK_END_TOKEN)
    is_reasoning_model = params.get("is_reasoning_model", False)

    reasoning_patterns = [
        {"start": think_start_token, "end": think_end_token},
    ]
    sampler = make_sampler(
        temp=temperature,
        top_p=top_p,
        # min_p=min_p,
        # min_tokens_to_keep=min_tokens_to_keep,
        top_k=top_k,
        xtc_special_tokens=tokenizer.encode("\n") + list(tokenizer.eos_token_ids),
    )

    # Initialize the performance monitor with estimated token count
    estimated_input_tokens = len(tokenizer.encode(prompt))
    perf_monitor = LLMPerformanceMonitor(input_token_count=estimated_input_tokens)

    # Start measuring prefill phase
    perf_monitor.start_prefill()

    results_generator = stream_generate(
        model, tokenizer, prompt=prompt, max_tokens=max_new_tokens, sampler=sampler
    )
    text = ""
    is_first = True
    for res in results_generator:
        new_text = res.text
        text += new_text
        # The prompt processing tokens-per-second.
        # prompt_tps = res.prompt_tps
        # The number of tokens in the prompt.
        prompt_tokens = res.prompt_tokens
        # The number of generated tokens.
        generation_tokens = res.generation_tokens
        # The tokens-per-second for generation.
        # generation_tps = res.generation_tps
        # The peak memory used so far in GB.
        # peak_memory = res.peak_memory
        # "length", "stop" or `None`
        finish_reason = res.finish_reason
        if (
            prompt.rstrip().endswith(think_start_token)
            and is_reasoning_model
            and is_first
        ):
            text = think_start_token + "\n" + text
            is_first = False

        msg = parse_chat_message(
            text,
            extract_reasoning=is_reasoning_model,
            reasoning_patterns=reasoning_patterns,
        )

        # If this is the first iteration, update the input token count
        if perf_monitor.metrics.input_token_count != prompt_tokens:
            perf_monitor.metrics.input_token_count = prompt_tokens
        # Update performance metrics based on current token count
        perf_metrics = perf_monitor.on_tokens_received(generation_tokens)
        usage = {
            "prompt_tokens": prompt_tokens,
            "completion_tokens": generation_tokens,
            "total_tokens": prompt_tokens + generation_tokens,
        }
        # Check if generation is complete
        is_complete = finish_reason is not None
        if is_complete:
            perf_monitor.end_generation()
        usage.update(perf_metrics)

        yield ModelOutput.build(
            msg.content,
            msg.reasoning_content,
            error_code=0,
            usage=usage,
            is_reasoning_model=is_reasoning_model,
        )
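For context, a minimal sketch of the mlx_lm primitives this module builds on, using only the calls that already appear above (the model name and prompt are placeholders):

```python
from mlx_lm import load, stream_generate
from mlx_lm.sample_utils import make_sampler

# load() fetches the model from Hugging Face if it is not a local path.
model, tokenizer = load("Qwen/Qwen3-0.6B-MLX-4bit")
sampler = make_sampler(temp=0.7, top_p=0.9, top_k=0)

prompt = tokenizer.apply_chat_template(
    [{"role": "user", "content": "What is MLX?"}],
    tokenize=False,
    add_generation_prompt=True,
)
# Each response exposes text, prompt_tokens, generation_tokens and finish_reason,
# which generate_stream() above folds into ModelOutput usage metrics.
for res in stream_generate(model, tokenizer, prompt=prompt, max_tokens=256, sampler=sampler):
    print(res.text, end="", flush=True)
```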