Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-13 21:22:49 +00:00
[Inference]Update inference config and fix test (#5178)
* unify the config setting
* fix test
* fix import
* fix test
* fix
* fix
* add logger
* revise log info

---------

Co-authored-by: CjhHa1 <cjh18671720497@outlook.com>
colossalai/inference/config.py · 84 lines · Normal file
@@ -0,0 +1,84 @@
import logging
from dataclasses import dataclass
from typing import Optional, Union

import torch
import torch.nn as nn

GibiByte = 1024**3

logger = logging.getLogger(__name__)


@dataclass
class InferenceConfig:
    """The inference configuration.

    Args:
        model: Path or nn.Module of this model.
        tokenizer: Path of the tokenizer to use.
        tokenizer_mode: "auto" will use the fast tokenizer if available, and "slow" will always use the slow tokenizer.
        trust_remote_code: Whether to trust remote code from huggingface.
        max_batch_size: Maximum batch size.
        max_output_len: Maximum output length.
        max_input_len: Maximum input length.
        block_size: The number of blocks in a logical block.
        dtype: The data type for weights and activations.
        tp_size: Tensor parallel size.
        pp_size: Pipeline parallel size.
        max_seq_len: Maximum length of the input sentence.
        quant_mode: Quantization mode.
        revision: The specific version (a branch name, a commit id, or a tag name) of the model to use.
        beam_width: The maximum beam width used to initialize the KV cache.
            During generation, the beam width provided as a sampling parameter should be less than or equal to this value.
        prefill_ratio: A controlling ratio of prefill to decoding sequences in the running list; a prefill step
            is performed when the actual value exceeds this ratio.
    """

    model: Union[str, nn.Module]
    tokenizer: Optional[str] = None
    tokenizer_mode: str = "auto"
    trust_remote_code: bool = False
    max_batch_size: Optional[int] = None
    max_output_len: int = 256
    max_input_len: int = 256
    block_size: int = 16
    dtype: Union[str, torch.dtype] = torch.float32
    tp_size: int = 1
    pp_size: int = 1
    max_seq_len: Optional[int] = None
    quant_mode: Optional[str] = None
    revision: Optional[str] = None
    beam_width: int = 1
    # TODO: beam search is not supported for now
    prefill_ratio: Optional[float] = 1.2
    # the ratio of prefill sequences to decoding sequences; a prefill step is performed once the actual value exceeds this ratio

    def _init_batch_size(self):
        """
        max_batch_size is set to accurately utilize the GPU memory.
        We use a simple heuristic based on the GPU memory size; the user can still set it manually.
        """
        if self.max_batch_size is not None:
            # already set by the user
            return

        device = torch.device("cuda")
        total_mem = torch.cuda.get_device_properties(device).total_memory // GibiByte
        self.max_batch_size = 8

        if 40 < total_mem <= 60:
            self.max_batch_size = 16
        elif 60 < total_mem <= 80:
            self.max_batch_size = 32
        logger.info(
            f"The maximum batch size is automatically set to {self.max_batch_size} as no value is provided by the user."
        )

    def __post_init__(self):
        self._init_batch_size()
        self._verify_args()

    def _verify_args(self):
        if self.tokenizer_mode not in ["auto", "slow"]:
            raise ValueError(f"Tokenizer mode must be either 'auto' or 'slow', but got {self.tokenizer_mode}")
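For context, a minimal usage sketch of the InferenceConfig defined above (not part of this commit). The model path is a hypothetical placeholder, and max_batch_size is set explicitly so the CUDA-based auto-detection in _init_batch_size is skipped, letting the sketch run on a CPU-only machine:

import torch

from colossalai.inference.config import InferenceConfig

# Hypothetical checkpoint path, for illustration only.
config = InferenceConfig(
    model="/path/to/llama",
    tokenizer="/path/to/llama",
    max_batch_size=8,      # user-provided, so _init_batch_size returns early
    max_input_len=128,
    max_output_len=128,
    dtype=torch.float16,
)
print(config.max_batch_size)  # -> 8

If max_batch_size were left as None on a CUDA machine, __post_init__ would pick 8, 16, or 32 based on the detected GPU memory and log the chosen value.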