[Inference]Add BatchInferState, Sequence and InferConfig (#5149)

* add infer_struct and infer_config

* update codes

* change InferConfig

* Add hf_model_config to the engine

* rm _get_hf_model_config

* update codes

* address reviewer feedback

* update codes

* add ci test for config and struct

Author: yuehuayingxueluo
Date: 2023-12-07 14:34:01 +08:00
Committed by: FrankLeeeee
Parent: 2bb92243d4
Commit: fab9b931d9

5 changed files with 279 additions and 34 deletions
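
For context before the diff: the commit title names three new structures (BatchInferState, Sequence, InferConfig/InferenceConfig) whose definitions are not shown in this hunk. The sketch below is only an illustration of what such config and sequence structs could look like; every field and method name here is an assumption, not taken from the commit.

# Hypothetical sketch only: field names are assumptions, not taken from this diff.
from dataclasses import dataclass, field
from typing import List, Optional


@dataclass
class InferenceConfig:
    """Unified inference config (illustrative fields only)."""

    model: str = ""
    max_batch_size: int = 8
    max_input_len: int = 256
    max_output_len: int = 256
    dtype: str = "fp16"


@dataclass
class Sequence:
    """One inference request tracked by the engine (illustrative fields only)."""

    request_id: int
    prompt: str
    input_token_id: List[int] = field(default_factory=list)
    output_token_id: List[int] = field(default_factory=list)

    def is_finished(self, eos_token_id: Optional[int] = None) -> bool:
        # Sketch logic: a sequence is done once its last generated token is EOS.
        return bool(self.output_token_id) and self.output_token_id[-1] == eos_token_id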


@@ -1,12 +1,14 @@
 from logging import Logger
 from typing import Optional
 
-from .request_handler import RequestHandler
+from transformers import AutoConfig
+
+from .config import InferenceConfig
 
 
-class InferEngine:
+class InferenceEngine:
     """
-    InferEngine is the core component for Inference.
+    InferenceEngine is the core component for Inference.
 
     It is responsible for launching the inference process, including:
         - Initialize model and distributed training environment (if needed)
@@ -15,37 +17,27 @@ class InferEngine:
         - Log the generation process
 
     Args:
-        colossal_config: We provide a unified config api that wraps all the configs. You can use it to replace the configs below.
-        model_config: The configuration for the model.
-        parallel_config: The configuration for parallelizing the model.
-        cache_config: Configuration for initializing and managing the kv cache.
-        tokenizer (Tokenizer): The tokenizer to be used for inference.
-        use_logger (bool): Determine whether or not to log the generation process.
+        tokenizer: Path of the tokenizer to use.
+        inference_config: We provide a unified config api that wraps all the configs. You can use it to replace the configs below.
+        verbose (bool): Determine whether or not to log the generation process.
     """
 
     def __init__(
         self,
-        model_config,
-        cache_config,
-        parallel_config,
-        tokenizer,
-        use_logger: bool = False,
-        colossal_config: Optional["ColossalInferConfig"] = None,
+        tokenizer: str = None,
+        inference_config: Optional["InferenceConfig"] = None,
+        verbose: bool = False,
     ) -> None:
-        assert colossal_config or (
-            model_config and cache_config and parallel_config
-        ), "Please provide colossal_config or model_config, cache_config, parallel_config"
-        if colossal_config:
-            model_config, cache_config, parallel_config = colossal_config
-        self.model_config = model_config
-        self.cache_config = cache_config
-        self.parallel_config = parallel_config
-        self._verify_config()
+        assert inference_config, "Please provide inference_config."
 
         self._init_model()
-        self.request_handler = RequestHandler(cache_config)
-        if use_logger:
+        # cache_config may need to be modified later.
+        # self.request_handler = RequestHandler(cache_config)
+        self.tokenizer = tokenizer
+        self.hf_model_config = AutoConfig.from_pretrained(
+            self.model, trust_remote_code=self.trust_remote_code, revision=self.revision
+        )
+        if verbose:
             self.logger = Logger()
 
     def _init_model(self):
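
Reading the new signature in the hunk above, post-commit construction would look roughly like the sketch below. The import paths and argument values are assumptions, not taken from the diff; InferenceConfig lives in the .config module the diff imports but does not show, so its constructor arguments are unknown here.

# Rough usage sketch of the post-commit API; module paths and values are placeholders.
from colossalai.inference.core.config import InferenceConfig  # assumed module path
from colossalai.inference.core.engine import InferenceEngine  # assumed module path

inference_config = InferenceConfig()  # fields not shown in this diff; may require arguments
engine = InferenceEngine(
    tokenizer="hf-internal-testing/llama-tokenizer",  # placeholder tokenizer path/name
    inference_config=inference_config,
    verbose=True,  # attaches a Logger, per the diff
)

Note that the engine now requires inference_config (it asserts on it) and loads the Hugging Face model config itself via AutoConfig.from_pretrained, replacing the earlier model_config/cache_config/parallel_config trio.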