[inference] Refactor inference architecture (#5057)

* [inference] support only TP (#4998)

* support only tp

* enable tp

* add support for bloom (#5008)

* [refactor] refactor gptq and smoothquant llama (#5012)

* refactor gptq and smoothquant llama

* fix import error

* fix linear import torch-int

* fix smoothquant llama import error

* fix import accelerate error

* fix bug

* fix import smooth cuda

* fix smoothcuda

* [Inference Refactor] Merge chatglm2 with pp and tp (#5023)

merge chatglm with pp and tp

* [Refactor] remove useless inference code (#5022)

* remove useless code

* fix quant model

* fix test import bug

* mv original inference legacy

* fix chatglm2

* [Refactor] refactor policy search and quant type controlling in inference (#5035)

* [Refactor] refactor policy search and quant type controlling in inference

* [inference] update readme (#5051)

* update readme

* update readme

* fix architecture

* fix table

* fix table

* [inference] update example (#5053)

* update example

* fix run.sh

* fix rebase bug

* fix some errors

* update readme

* add some features

* update interface

* update readme

* update benchmark

* add requirements-infer

---------

Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
This commit is contained in:
Xu Kai
2023-11-19 21:05:05 +08:00
committed by GitHub
parent bc09b95f50
commit fd6482ad8c
115 changed files with 6027 additions and 1431 deletions

View File

@@ -0,0 +1,40 @@
"""
Motivated by VllM (https://github.com/vllm-project/vllm), This module is trying to resolve the tokenizer issue.
license: MIT, see LICENSE for more details.
"""
from transformers import AutoTokenizer
_FAST_LLAMA_TOKENIZER = "hf-internal-testing/llama-tokenizer"
def get_tokenizer(
    tokenizer=None,
    tokenizer_name: str = "",
    trust_remote_code: bool = False,
    use_fast: bool = True,
):
    """Return a usable tokenizer instance.

    If ``tokenizer`` is already provided it is returned unchanged; otherwise one
    is loaded via ``AutoTokenizer.from_pretrained``.

    Args:
        tokenizer: An already-constructed tokenizer, or None to load one.
        tokenizer_name: Model/tokenizer name or path for HuggingFace.
        trust_remote_code: Forwarded to ``from_pretrained``.
        use_fast: Prefer the fast (Rust-backed) tokenizer implementation.

    Returns:
        A HuggingFace tokenizer.
    """
    # Early return removes the original's no-op `tokenizer = tokenizer` branch.
    if tokenizer is not None:
        return tokenizer

    if "llama" in tokenizer_name.lower() and use_fast:
        print(
            "For some LLaMA-based models, initializing the fast tokenizer may "
            "take a long time. To eliminate the initialization time, consider "
            f"using '{_FAST_LLAMA_TOKENIZER}' instead of the original "
            "tokenizer. This is done automatically in Colossalai."
        )
        # Swap in the known-fast tokenizer for LLaMA-family models.
        tokenizer_name = _FAST_LLAMA_TOKENIZER
    try:
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_name, use_fast=use_fast, trust_remote_code=trust_remote_code
        )
    except TypeError:
        # Some tokenizer classes do not accept `use_fast`; retry with the slow
        # implementation.
        tokenizer = AutoTokenizer.from_pretrained(
            tokenizer_name, use_fast=False, trust_remote_code=trust_remote_code
        )
    return tokenizer

View File

@@ -0,0 +1,346 @@
# Adapted from https://github.com/ModelTC/lightllm
import collections
from dataclasses import dataclass
from typing import Dict, List, Tuple
import numpy as np
import torch
from colossalai.inference.tensor_parallel import MemoryManager
# make batch infer state an attr of InferBatch
class InferSamplingParams:
    """Per-request sampling configuration used by the inference engine."""

    def __init__(
        self,
        do_sample: bool = False,
        presence_penalty: float = 0.0,
        frequency_penalty: float = 0.0,
        temperature: float = 1.0,
        top_p: float = 1.0,
        top_k: int = -1,
        vocab_size: int = -1,
    ) -> None:
        self.do_sample = do_sample
        self.presence_penalty = presence_penalty
        self.frequency_penalty = frequency_penalty
        self.temperature = temperature
        self.top_p = top_p
        # A top_k of -1 means "no truncation": sample over the full vocabulary.
        self.top_k = vocab_size if top_k == -1 else top_k
@dataclass
class InferBatch:
    """Runtime state of one in-flight batch in the no-padding ("nopad") layout.

    KV-cache slot indices for request ``i`` are stored right-aligned in row
    ``nopad_b_loc[i]``; slots are obtained from and returned to ``cache_manager``.
    Adapted from lightllm.
    """

    batch_id: int  # unique id of this batch
    requests: List  # raw request dicts as received from the router
    requests_idx_mapping: Dict[int, int]  # request_id -> row index within the batch
    input_ids: torch.Tensor  # flattened input token ids of all requests
    all_input_ids: List[List[int]]  # per-request token-id lists
    input_lengths: List[int]  # per-request prompt lengths
    out_token_id_counts: List  # per-request defaultdict(int): generated token id -> count
    sampling_param_list: List[InferSamplingParams]  # per-request sampling settings
    nopad_total_token_num: int  # total tokens currently held by the batch
    nopad_max_len_in_batch: int  # longest sequence length in the batch
    nopad_b_loc: torch.Tensor  # per-request KV-cache slot indices (right-aligned rows)
    nopad_b_start_loc: torch.Tensor  # per-request start offsets into the flat token stream
    nopad_b_seq_len: torch.Tensor  # per-request current sequence lengths
    cache_manager: MemoryManager  # allocator/recycler of KV-cache slots
    max_total_len: int  # upper bound on one sequence's total (prompt + output) length

    @classmethod
    @torch.no_grad()
    def init_batch(
        cls,
        batch_id,
        requests,
        dtype: torch.dtype,
        device: torch.device,
        cache_manager: MemoryManager,
        vocab_size: int,
        max_total_len: int,
    ) -> "InferBatch":
        """Build an InferBatch from raw request dicts, allocating bookkeeping tensors on CUDA."""
        input_lengths = []
        all_input_ids = []
        requests_idx_mapping = {}
        out_token_id_counts = []
        sampling_param_list = []
        nopad_total_token_num = 0
        nopad_max_len_in_batch = 0
        nopad_b_loc = torch.empty((len(requests), max_total_len + 12), dtype=torch.long, device="cuda")
        # to avoid memory leak , we pre-allocate 12 more space for each batch.
        nopad_b_start_loc = torch.zeros(len(requests), dtype=torch.int32, device="cuda")
        for i, r in enumerate(requests):
            # request id -> idx in list mapping
            requests_idx_mapping[r["request_id"]] = i
            tokenized_input = r["input_id"]
            input_length = len(tokenized_input)
            input_lengths.append(input_length)
            all_input_ids.append(tokenized_input)
            out_token_id_counts.append(collections.defaultdict(int))
            # postprocessor
            sampling_param = r["sampling_param"]
            sampling_param["vocab_size"] = vocab_size
            sampling_param_list.append(InferSamplingParams(**sampling_param))
            nopad_total_token_num += input_length
            nopad_max_len_in_batch = max(nopad_max_len_in_batch, input_length)
        nopad_b_seq_len = torch.tensor(input_lengths, dtype=torch.int32, device="cuda")
        # Start offset of request i is the cumulative length of requests 0..i-1.
        nopad_b_start_loc[1:] = torch.cumsum(nopad_b_seq_len, dim=0, dtype=torch.int32)[0:-1]
        if len(requests) > 1:
            input_ids = np.concatenate(all_input_ids, dtype=np.int64)
        else:
            input_ids = all_input_ids[0]
        # Create tensors on device
        input_ids = torch.tensor(input_ids, dtype=torch.int64, device=device)
        return cls(
            batch_id=batch_id,
            requests=requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=input_ids,
            input_lengths=input_lengths,
            all_input_ids=all_input_ids,
            nopad_total_token_num=nopad_total_token_num,
            nopad_max_len_in_batch=nopad_max_len_in_batch,
            nopad_b_loc=nopad_b_loc,
            nopad_b_start_loc=nopad_b_start_loc,
            nopad_b_seq_len=nopad_b_seq_len,
            out_token_id_counts=out_token_id_counts,
            sampling_param_list=sampling_param_list,
            cache_manager=cache_manager,
            max_total_len=max_total_len,
        )

    @torch.no_grad()
    def free_self(self) -> None:
        """
        Free the memory of the InferBatch itself
        """
        remove_index = []
        for idx in range(len(self)):
            # Row idx is right-aligned: columns
            # [(max_len-1) - (seq_len-1), max_len-1) hold its cached slot ids.
            remove_index.append(
                self.nopad_b_loc[
                    idx,
                    (self.nopad_max_len_in_batch - 1)
                    - (self.nopad_b_seq_len[idx] - 1) : (self.nopad_max_len_in_batch - 1),
                ]
            )
        remove_index = torch.cat(remove_index, dim=-1)
        # Return every collected KV-cache slot to the shared memory manager.
        self.cache_manager.free(remove_index)

    @torch.no_grad()
    def filter(self, request_ids: List[int]) -> "InferBatch":
        """
        Filter finished batch and return a new InferBatch with left ones.
        """
        if len(request_ids) == 0:
            raise ValueError("Batch must have at least one request")
        if len(request_ids) == len(self):
            # Nothing is removed; reuse the current batch as-is.
            return self
        requests_idx_mapping = {}
        indices = []
        requests = []
        all_input_ids = []
        input_lengths = []
        nopad_total_token_num = 0
        nopad_max_len_in_batch = 0
        nopad_b_loc = torch.empty((len(request_ids), self.max_total_len + 12), dtype=torch.long, device="cuda")
        nopad_b_start_loc = torch.zeros(len(request_ids), dtype=torch.int32, device="cuda")
        nopad_b_seq_len = torch.zeros(len(request_ids), dtype=torch.int32, device="cuda")
        left_idx = []
        for i, request_id in enumerate(request_ids):
            idx = self.requests_idx_mapping[request_id]
            left_idx.append(idx)
        left_idx_set = set(left_idx)
        remove_index = []
        # Free KV-cache slots of every request that is NOT kept.
        for idx in range(len(self)):
            if idx not in left_idx_set:
                remove_index.append(
                    self.nopad_b_loc[
                        idx,
                        (self.nopad_max_len_in_batch - 1)
                        - (self.nopad_b_seq_len[idx] - 1) : (self.nopad_max_len_in_batch - 1),
                    ]
                )
        remove_index = torch.cat(remove_index, dim=-1)
        self.cache_manager.free(remove_index)
        nopad_max_len_in_batch = 0
        for i, request_id in enumerate(request_ids):
            idx = self.requests_idx_mapping[request_id]
            indices.append(idx)
        nopad_b_seq_len[:] = self.nopad_b_seq_len[indices]
        nopad_max_len_in_batch = torch.max(nopad_b_seq_len).item()
        nopad_b_start_loc[1:] = torch.cumsum(nopad_b_seq_len, dim=0, dtype=torch.int32)[0:-1]
        nopad_total_token_num = torch.sum(nopad_b_seq_len).item()
        # Re-align the kept rows against the (possibly smaller) new max length.
        nopad_b_loc[:, 0 : (nopad_max_len_in_batch - 1)] = self.nopad_b_loc[
            indices,
            (self.nopad_max_len_in_batch - 1) - (nopad_max_len_in_batch - 1) : (self.nopad_max_len_in_batch - 1),
        ]
        for i, request_id in enumerate(request_ids):
            idx = self.requests_idx_mapping[request_id]
            requests_idx_mapping[request_id] = i
            requests.append(self.requests[idx])
            all_input_ids.append(self.all_input_ids[idx])
            input_lengths.append(self.input_lengths[idx])
        input_ids = self.input_ids[indices]
        return InferBatch(
            batch_id=self.batch_id,
            requests=requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=input_ids,
            input_lengths=input_lengths,
            all_input_ids=all_input_ids,
            nopad_total_token_num=nopad_total_token_num,
            nopad_max_len_in_batch=nopad_max_len_in_batch,
            nopad_b_loc=nopad_b_loc,
            nopad_b_start_loc=nopad_b_start_loc,
            nopad_b_seq_len=nopad_b_seq_len,
            out_token_id_counts=[self.out_token_id_counts[_i] for _i in indices],
            sampling_param_list=[self.sampling_param_list[_i] for _i in indices],
            cache_manager=self.cache_manager,
            max_total_len=self.max_total_len,
        )

    @classmethod
    @torch.no_grad()
    def merge(cls, batch1, batch2) -> "InferBatch":
        """
        Return merged new InferBatch
        """
        requests = batch1.requests + batch2.requests
        requests_idx_mapping = {}
        new_batch_size = len(batch1) + len(batch2)
        input_ids = batch1.input_ids.new_empty(new_batch_size)
        all_input_ids = []
        input_lengths = []
        out_token_id_counts = []
        sampling_param_list = []
        cumulative_batch_size = 0
        nopad_total_token_num = batch1.nopad_total_token_num + batch2.nopad_total_token_num
        nopad_max_len_in_batch = max(batch1.nopad_max_len_in_batch, batch2.nopad_max_len_in_batch)
        max_total_len = max(batch1.max_total_len, batch2.max_total_len)
        nopad_b_loc = torch.empty((new_batch_size, batch1.max_total_len + 12), dtype=torch.long, device="cuda")
        nopad_b_start_loc = torch.zeros(new_batch_size, dtype=torch.int32, device="cuda")
        nopad_b_seq_len = torch.zeros(new_batch_size, dtype=torch.int32, device="cuda")
        nopad_start_loc_len_temp = 0
        batches = [batch1, batch2]
        for i, batch in enumerate(batches):
            if i == 0:
                # NOTE(review): reuses batch1's mapping dict in place and mutates
                # it below — assumes batch1 is discarded after merge; confirm.
                requests_idx_mapping = batch.requests_idx_mapping
            else:
                # Shift the second batch's row indices past the first batch's rows.
                for k, v in batch.requests_idx_mapping.items():
                    requests_idx_mapping[k] = v + cumulative_batch_size
            start_index = cumulative_batch_size
            end_index = cumulative_batch_size + len(batch)
            input_ids[start_index:end_index] = batch.input_ids
            nopad_b_seq_len[start_index:end_index] = batch.nopad_b_seq_len
            nopad_b_start_loc[start_index:end_index] = batch.nopad_b_start_loc + nopad_start_loc_len_temp
            nopad_start_loc_len_temp = nopad_b_start_loc[end_index - 1] + nopad_b_seq_len[end_index - 1]
            # Copy each batch's KV slot rows, right-aligned against the merged max length.
            nopad_b_loc[
                start_index:end_index,
                nopad_max_len_in_batch - batch.nopad_max_len_in_batch : nopad_max_len_in_batch - 1,
            ] = batch.nopad_b_loc[:, : batch.nopad_max_len_in_batch - 1]
            all_input_ids.extend(batch.all_input_ids)
            input_lengths.extend(batch.input_lengths)
            out_token_id_counts.extend(batch.out_token_id_counts)
            sampling_param_list.extend(batch.sampling_param_list)
            # Update
            cumulative_batch_size += len(batch)
        nopad_b_loc[:, nopad_max_len_in_batch - 1] = (
            nopad_total_token_num - new_batch_size + torch.arange(0, new_batch_size, dtype=torch.int32, device="cuda")
        )
        return InferBatch(
            batch_id=batches[0].batch_id,
            requests=requests,
            requests_idx_mapping=requests_idx_mapping,
            input_ids=input_ids,
            input_lengths=input_lengths,
            all_input_ids=all_input_ids,
            nopad_total_token_num=nopad_total_token_num,
            nopad_max_len_in_batch=nopad_max_len_in_batch,
            nopad_b_loc=nopad_b_loc,
            nopad_b_start_loc=nopad_b_start_loc,
            nopad_b_seq_len=nopad_b_seq_len,
            out_token_id_counts=out_token_id_counts,
            sampling_param_list=sampling_param_list,
            cache_manager=batches[0].cache_manager,
            max_total_len=max_total_len,
        )

    def __len__(self):
        # Number of requests currently in the batch.
        return len(self.requests)

    def get_post_sample_tensors(self) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
        """Pack per-request sampling settings and generated-token histograms into CUDA tensors."""
        presence_penalties: List[float] = []
        frequency_penalties: List[float] = []
        temperatures: List[float] = []
        top_ps: List[float] = []
        top_ks: List[int] = []
        p_token_ids: List[int] = []
        p_token_counts: List[int] = []
        # p_seq_len starts with a leading 0 so its cumsum yields start offsets.
        p_seq_len: List[int] = [
            0,
        ]
        p_max_len_in_batch: int = 0
        for i, id_to_count in enumerate(self.out_token_id_counts):
            sample_param = self.sampling_param_list[i]
            presence_penalties.append(sample_param.presence_penalty)
            frequency_penalties.append(sample_param.frequency_penalty)
            temperatures.append(sample_param.temperature)
            top_ps.append(sample_param.top_p)
            top_ks.append(sample_param.top_k)
            # Flatten this request's token-id histogram into parallel lists.
            for token_id, count in id_to_count.items():
                p_token_ids.append(token_id)
                p_token_counts.append(count)
            p_seq_len.append(len(id_to_count))
            p_max_len_in_batch = max(p_max_len_in_batch, len(id_to_count))
        presence_penalties = torch.tensor(presence_penalties, dtype=torch.float, device="cuda")
        frequency_penalties = torch.tensor(frequency_penalties, dtype=torch.float, device="cuda")
        temperatures = torch.tensor(temperatures, dtype=torch.float, device="cuda")
        top_ps = torch.tensor(top_ps, dtype=torch.float, device="cuda")
        top_ks = torch.tensor(top_ks, dtype=torch.int32, device="cuda")
        p_token_ids = torch.tensor(p_token_ids, dtype=torch.int32, device="cuda")
        p_token_counts = torch.tensor(p_token_counts, dtype=torch.int32, device="cuda")
        p_seq_len = torch.tensor(p_seq_len, dtype=torch.int32, device="cuda")
        p_cumsum_seq_len = torch.cumsum(p_seq_len, dim=0, dtype=torch.int32)
        return (
            presence_penalties,
            frequency_penalties,
            temperatures,
            top_ps,
            top_ks,
            p_token_ids,
            p_token_counts,
            p_cumsum_seq_len,
            p_max_len_in_batch,
        )

View File

@@ -0,0 +1,166 @@
# Adapted from https://github.com/ModelTC/lightllm
from typing import Dict, List, Tuple
from .sampling_params import SamplingParams
class Req:
    """A single generation request tracked by the dynamic-batching router."""

    def __init__(self, request_id, prompt_ids, sample_params: SamplingParams, prompts: str = ""):
        self.request_id = request_id
        self.prompt_ids = prompt_ids
        self.input_len = len(prompt_ids)
        self.max_output_len = sample_params.max_new_tokens
        self.sample_params = sample_params
        self.output_ids = []
        self.output_metadata_list = []
        self.has_generate_finished = False
        self.aborted = False
        self.prompts = prompts

    def to_rpc_obj(self):
        """Serialize the request into a plain dict for cross-process transfer."""
        return {
            "request_id": self.request_id,
            "input_id": self.prompt_ids,
            "output_len": self.max_output_len,
            "sampling_param": self.sample_params.to_dict(),
        }

    def stop_sequences_matched(self):
        """Return True when the generated tail equals any configured stop sequence."""
        # NOTE: should stop sequences live on the sampling params instead?
        stop_sequences = self.sample_params.stop_sequences
        if stop_sequences is None:
            return False
        generated = self.output_ids
        for stop_token_ids in stop_sequences:
            stop_len = len(stop_token_ids)
            if stop_len == 0 or stop_len > len(generated):
                continue
            tail = generated[-stop_len:]
            if all(a == b for a, b in zip(tail, stop_token_ids)):
                return True
        return False

    def __repr__(self):
        return f"request_id(n={self.request_id}, prompt_ids={self.prompt_ids}, "
class Batch:
    """A set of requests scheduled to run together in one forward batch.

    Args:
        batch_id: Unique identifier of the batch.
        reqs: Requests contained in this batch.
    """

    def __init__(self, batch_id, reqs: List[Req]):
        self.batch_id = batch_id
        self.reqs = reqs
        self.id_to_reqs = {req.request_id: req for req in reqs}

    def input_tokens(self):
        """Total number of prompt tokens across all requests."""
        return sum(req.input_len for req in self.reqs)

    def calcu_max_tokens(self):
        """Upper bound on tokens this batch may occupy (prompt + max generation)."""
        return sum(req.input_len + req.max_output_len for req in self.reqs)

    def calcu_used_tokens(self):
        """Tokens currently occupied (prompt + generated so far)."""
        return sum(req.input_len + len(req.output_ids) for req in self.reqs)

    def mark_finished_req(self, eos_id, engine_max_output_len):
        """Mark requests that are done generating.

        A request finishes when a stop sequence matches, the engine-wide or
        per-request output limit is reached, an (un-ignored) EOS token was just
        produced, or the request was aborted.

        Returns:
            True if at least one request is in the finished state after this pass.
        """
        has_new_finish = False
        for req in self.reqs:
            if req.stop_sequences_matched():
                req.has_generate_finished = True
                has_new_finish = True
            if len(req.output_ids) >= engine_max_output_len:
                req.has_generate_finished = True
                has_new_finish = True
            # Guard against an empty output list before peeking at the last
            # token (the original raised IndexError here).
            if req.output_ids and req.output_ids[-1] == eos_id and not req.sample_params.ignore_eos:
                req.has_generate_finished = True
                has_new_finish = True
            if len(req.output_ids) >= req.max_output_len or req.aborted:
                req.has_generate_finished = True
                has_new_finish = True
        return has_new_finish

    def filter_finished(self) -> List[Req]:
        """
        Filter finished requests from the batch, the finished ones will be removed from 'reqs'.
        """
        # TODO: the logic of return should be defined here.
        unfinished_req = []
        finished_req = []
        for req in self.reqs:
            if req.has_generate_finished:
                finished_req.append(req)
            else:
                unfinished_req.append(req)
        self.reqs = unfinished_req
        self.id_to_reqs = {req.request_id: req for req in self.reqs}
        return finished_req

    def is_clear(self):
        """True when no requests remain in the batch."""
        return not self.reqs

    def merge(self, mini_batch):
        """Absorb all requests of ``mini_batch`` into this batch."""
        self.reqs.extend(mini_batch.reqs)
        self.id_to_reqs = {req.request_id: req for req in self.reqs}

    def __repr__(self):
        return f"batch_id={self.batch_id}, " f"reqs={self.reqs}, "

    def __len__(self):
        return len(self.reqs)
class BatchTokenIdOut:
    """Message carrying newly generated token ids, one tuple per request."""

    def __init__(self):
        # Each entry: (req_id, new_token_id, gen_metadata, finished_state, abort_state)
        self.reqs_infs: List[Tuple[str, int, Dict, bool, bool]] = []
class BatchStrOut:
    """Message carrying detokenized output strings, one tuple per request."""

    def __init__(self):
        # Each entry: (req_id, token_str, gen_metadata, finished_state, abort_state)
        self.reqs_infs: List[Tuple[str, str, Dict, bool, bool]] = []
class AbortReq:
    """Message asking the router to abort the request identified by ``req_id``."""

    def __init__(self, req_id):
        self.req_id = req_id
class RequestOutput:
    """The output data of a request to the LLM.

    Args:
        request_id: The unique ID of the request.
        prompt: The prompt string of the request.
        prompt_token_ids: The token IDs of the prompt.
        outputs: The output sequences of the request.
    """

    def __init__(
        self,
        request_id: str,
        prompt: str,
        prompt_token_ids: List[int],
        outputs,
    ) -> None:
        self.request_id = request_id
        self.prompt = prompt
        self.prompt_token_ids = prompt_token_ids
        self.outputs = outputs

    def __repr__(self) -> str:
        parts = (
            f"RequestOutput(request_id={self.request_id}, ",
            f"prompt={self.prompt!r}, ",
            f"prompt_token_ids={self.prompt_token_ids}, ",
            f"outputs={self.outputs}, ",
        )
        return "".join(parts)

View File

@@ -0,0 +1,154 @@
import logging
import os
from typing import List
import ray
import ray.util.collective as collective
import torch
from transformers import AutoModelForCausalLM
import colossalai
from colossalai.inference.async_manager import start_dynamic_batching
from colossalai.inference.dynamic_batching.get_tokenizer import get_tokenizer
from colossalai.inference.dynamic_batching.io_struct import RequestOutput
from colossalai.inference.dynamic_batching.ray_init_config import EngineArgsClass, RooterArgsClass
from colossalai.inference.dynamic_batching.sampling_params import SamplingParams
from colossalai.inference.tensor_parallel.engine import TPInferEngine
from colossalai.shardformer import ShardConfig
from colossalai.testing import free_port
ray_serve_logger = logging.getLogger("ray.serve")
def log_cuda_info(scope_name: str):
    """Log GPU visibility details (Ray GPU ids, CUDA env var, device counts) for debugging."""
    ray_serve_logger.info(f" {scope_name}: ray.get_gpu_ids(): {ray.get_gpu_ids()}")
    visible = os.getenv("CUDA_VISIBLE_DEVICES", "NO DEVICES FOUND!")
    ray_serve_logger.info(f" {scope_name}: CUDA_VISIBLE_DEVICES: {visible}")
    if not torch.cuda.is_available():
        ray_serve_logger.info(f" {scope_name}: cuda is not available!")
        return
    ray_serve_logger.info(
        f" {scope_name}: cuda current_device: {torch.cuda.current_device()}, cuda device count: {torch.cuda.device_count()}"
    )
@ray.remote(num_gpus=1)
class Worker:
    """One tensor-parallel inference worker; each Ray actor owns a single GPU."""

    def __init__(
        self,
        model_path: str,
        tensor_parallel_size: int,
        max_batch_size: int,
        max_input_len: int,
        max_output_len: int,
        router_config: RooterArgsClass,
    ):
        # Only record configuration here; heavy model loading is deferred to
        # setup(), after the collective/distributed environment exists.
        log_cuda_info("Worker.init")
        self.tensor_parallel_size = tensor_parallel_size
        self.model_path = model_path
        self.max_batch_size = max_batch_size
        self.max_input_len = max_input_len
        self.max_output_len = max_output_len
        self.router_config = router_config

    def setup(self, world_size, rank, port):
        """Initialize the distributed env, load and shard the model, start dynamic batching.

        Returns True so the driver can ray.get() on completion.
        """
        # initialize a ray collective group, otherwise colossalai distributed env won't be built successfully
        collective.init_collective_group(world_size, rank, "nccl", "default")
        # initialize and set distributed environment
        colossalai.launch(config={}, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
        ray_serve_logger.info(f"Worker with rank {rank} (world size {world_size}) setting up..")
        log_cuda_info("Worker.setup")

        # Load model
        self.tokenizer = get_tokenizer(tokenizer_name=self.model_path)
        if self.tokenizer.pad_token is None:
            # Fall back to EOS as padding when the tokenizer defines no pad token.
            self.tokenizer.pad_token = self.tokenizer.eos_token
        self.model = AutoModelForCausalLM.from_pretrained(
            self.model_path, pad_token_id=self.tokenizer.pad_token_id, torch_dtype=torch.float16
        )
        # Enable tensor parallelism only when there is more than one worker.
        shard_config = ShardConfig(
            enable_tensor_parallelism=True if world_size > 1 else False, extra_kwargs={"inference_only": True}
        )
        self.infer_engine = TPInferEngine(
            self.model, shard_config, self.max_batch_size, self.max_input_len, self.max_output_len
        )
        self.start_dynamic_batching = start_dynamic_batching(self.router_config, self.infer_engine, [])

        return True

    # def generate(self, request_id: str, prompt: str, sampling_params: SamplingParams) -> List[str]:
    #     ray_serve_logger.info(f"text: {prompt}")
    #     final_outputs = self.start_dynamic_batching.generate(prompt, sampling_params, request_id)
    #     return final_outputs

    def add_input(self, request_id: str, prompt: str, sampling_params: SamplingParams):
        # Enqueue a raw prompt; tokenization happens inside the batching manager.
        self.start_dynamic_batching.add_input(request_id, prompt, sampling_params)

    def abort(self, request_id: str):
        # Ask the batching manager to drop the request with this id.
        self.start_dynamic_batching.abort(request_id)

    def step(self) -> List[RequestOutput]:
        # Advance the batching manager by one scheduling/inference step.
        return self.start_dynamic_batching._step()

    def add_req(self, prompt_ids: List[int], sampling_params: SamplingParams, request_id: str, prompt: str):
        # Enqueue an already-tokenized request.
        self.start_dynamic_batching.add_req(prompt_ids, sampling_params, request_id, prompt)

    def is_running(self):
        # presumably reports whether the batching loop still has work — TODO confirm
        return self.start_dynamic_batching.is_running()
class Driver:
    """Spawns one Worker actor per tensor-parallel rank and fans requests out to all of them."""

    def __init__(self, router_config: RooterArgsClass, engine_config: EngineArgsClass):
        log_cuda_info("Driver:init")
        model_path = engine_config.model
        tensor_parallel_size = engine_config.tensor_parallel_size

        self.num_workers = tensor_parallel_size
        self.workers = []
        init_rets = []

        # Just grab a free port on localhost
        # NOTE workers in this communication group listen to the same port
        available_port = free_port()

        for i in range(self.num_workers):
            worker_name = "worker_idx_{}".format(i)
            w = Worker.options(name=worker_name).remote(
                model_path,
                self.num_workers,
                engine_config.max_batch_size,
                engine_config.max_input_len,
                engine_config.max_output_len,
                router_config,
            )
            self.workers.append(w)
            # Kick off asynchronous per-worker setup; results are gathered below.
            init_rets.append(w.setup.remote(self.num_workers, i, available_port))
        _options = {
            "group_name": "default_driver",
            "world_size": self.num_workers,
            "ranks": [i for i in range(self.num_workers)],
            "backend": "nccl",
        }
        # Create the driver-side collective group spanning all workers.
        collective.create_collective_group(self.workers, **_options)
        # Block until every worker finished setup().
        _ = ray.get(init_rets)

    def add_input(self, request_id: str, prompt: str, sampling_params: SamplingParams):
        # Broadcast the prompt to every worker (each holds one model shard).
        ray.get([w.add_input.remote(request_id, prompt, sampling_params) for w in self.workers])

    def abort(self, request_id: str):
        # Broadcast the abort to every worker.
        ray.get([w.abort.remote(request_id) for w in self.workers])

    def step(self):
        # All workers produce identical outputs; return one copy.
        results = ray.get([w.step.remote() for w in self.workers])
        outputs = results[0]  # get any one of the copies
        return outputs

    def add_req(self, request_id: str, prompt_ids: List[int], sampling_params: SamplingParams, prompt: str):
        # Broadcast an already-tokenized request to every worker.
        ray.get([w.add_req.remote(prompt_ids, sampling_params, request_id, prompt) for w in self.workers])

    def is_running(self):
        results = ray.get([w.is_running.remote() for w in self.workers])
        return any(results)

View File

@@ -0,0 +1,58 @@
import logging
import yaml
from pydantic import BaseModel
logger = logging.getLogger(__name__)
class EngineArgsClass(BaseModel):
    """Config for Engine"""

    model: str  # model name or local path to load
    tensor_parallel_size: int = 2  # number of tensor-parallel workers/GPUs
    max_batch_size: int = 4  # maximum requests batched together
    max_input_len: int = 128  # maximum prompt length in tokens
    max_output_len: int = 32  # maximum generated length in tokens
# NOTE(review): "Rooter" looks like a typo for "Router"; name kept for compatibility.
class RooterArgsClass(BaseModel):
    """Config for Rooter"""

    max_total_token_num: int = 42  # token capacity shared by all running requests
    batch_max_tokens: int = 42  # max prompt tokens admitted into one new batch
    eos_id: int = 0  # end-of-sequence token id
    disable_log_stats: bool = False  # when True, throughput stats are not collected
    log_stats_interval: int = 10  # seconds between stats reports
    model: str  # model name or local path (mirrors the engine config)
class RayInitConfig(BaseModel):
    """All-together configs without app router config"""

    engine_config_data: EngineArgsClass
    router_config_data: RooterArgsClass

    @classmethod
    def from_yaml_path(cls, path: str):
        """Load engine and router config sections from the YAML file at ``path``."""
        try:
            with open(path, "r") as fh:
                try:
                    raw = yaml.safe_load(fh)
                except yaml.YAMLError as e:
                    logger.error(f"An Error occurred when parsing yaml: {e}")
                    raise
                # serve deployment config; missing sections fall back to field defaults
                return cls(
                    engine_config_data=raw.get("engine_config", {}),
                    router_config_data=raw.get("router_config", {}),
                )
        except FileNotFoundError:
            logger.error(f"The file '{path}' does not exist!")
            raise
        except OSError as e:
            logger.error(f"An Error occurred: {e}")
            raise

View File

@@ -0,0 +1,73 @@
# Adapted from https://github.com/ModelTC/lightllm
import uuid
from typing import List
import numpy as np
from .io_struct import Batch, Req
class ReqQueue:
    """FIFO queue of waiting requests with token-capacity admission control.

    Adapted from lightllm.

    Args:
        max_total_tokens: KV-cache capacity in tokens shared by running requests.
        batch_max_tokens: Maximum prompt tokens allowed in one newly formed batch.
        running_max_req_size: Maximum number of concurrently running requests.
        waiting_req_list: Optional initial list of waiting requests.
    """

    def __init__(self, max_total_tokens, batch_max_tokens, running_max_req_size, waiting_req_list=None) -> None:
        self.max_total_tokens = max_total_tokens
        assert batch_max_tokens is not None
        self.batch_max_tokens = batch_max_tokens
        self.running_max_req_size = running_max_req_size
        # BUGFIX: the original used a mutable default argument ([]), which is
        # shared across every ReqQueue instance; use a None sentinel instead.
        self.waiting_req_list: List[Req] = waiting_req_list if waiting_req_list is not None else []

    def append(self, req):
        """Add a request to the tail of the waiting queue."""
        self.waiting_req_list.append(req)

    def _init_cache_list(self, current_batch: Batch):
        # Seed the capacity estimate with (used_tokens, remaining_output) per
        # currently running request.
        if current_batch is not None:
            self.cache_len_list = [
                (req.input_len + len(req.output_ids), req.max_output_len - len(req.output_ids) - 1)
                for req in current_batch.reqs
            ]
        else:
            self.cache_len_list = []

    # @calculate_time(show=True, min_cost_ms=0.1)
    def _can_add_new_req(self, req):
        """Tentatively add ``req`` and check the worst-case token demand still fits.

        NOTE: mutates ``cache_len_list`` — the candidate stays accounted for.
        """
        self.cache_len_list.append((req.input_len + 1, req.max_output_len - 1))  # hard to analysis
        self.cache_len_list.sort(key=lambda x: -x[1])

        left_out_len_array = np.array([e[1] for e in self.cache_len_list])
        # assert left_out_len_array.min() >= 0
        has_run_len_array = np.array([e[0] for e in self.cache_len_list])
        cum_run_len_array = np.cumsum(has_run_len_array)
        size_array = np.arange(1, len(self.cache_len_list) + 1, 1)

        # Peak demand if every request runs to its remaining output budget.
        need_max_token_num = (left_out_len_array * size_array + cum_run_len_array).max()
        # NOTE: change here < to <=
        return need_max_token_num <= self.max_total_tokens and len(self.cache_len_list) <= self.running_max_req_size

    def generate_new_batch(self, current_batch: Batch = None):
        """Form a new Batch from waiting requests, or return None if nothing fits."""
        if current_batch is not None and len(current_batch.reqs) >= self.running_max_req_size:
            return None
        self._init_cache_list(current_batch)
        can_run_list = []
        new_batch_total_tokens = 0
        aborted_count = 0
        for req in self.waiting_req_list:
            if req.aborted:
                # BUGFIX: skip aborted requests BEFORE reserving capacity for
                # them (the original called _can_add_new_req first, so aborted
                # requests polluted the capacity estimate).
                aborted_count += 1
                continue
            if self._can_add_new_req(req) and new_batch_total_tokens + req.input_len <= self.batch_max_tokens:
                can_run_list.append(req)
                new_batch_total_tokens += req.input_len
            else:
                # Admission is strictly in FIFO order: stop at the first misfit.
                break
        if not can_run_list:
            return None
        new_batch = Batch(uuid.uuid4().hex, can_run_list)
        # Drop the admitted (and leading aborted) requests from the queue.
        self.waiting_req_list = self.waiting_req_list[len(can_run_list) + aborted_count :]
        return new_batch

    def __len__(self):
        return self.waiting_req_list.__len__()

View File

@@ -0,0 +1,83 @@
# Adapted from https://github.com/ModelTC/lightllm
"""Sampling parameters for text generation."""
from typing import List, Optional, Union
# Temperatures below this are treated as zero (greedy decoding).
_SAMPLING_EPS = 1e-5


class SamplingParams:
    """Sampling parameters for one text-generation request (adapted from lightllm)."""

    def __init__(
        self,
        do_sample: bool = False,
        presence_penalty: float = 0.0,
        frequency_penalty: float = 0.0,
        temperature: float = 1.0,
        top_p: float = 1.0,
        top_k: int = -1,  # -1 is for all
        ignore_eos: bool = False,
        max_new_tokens: int = 256,
        stop_sequences: Optional[Union[str, List[str]]] = None,  # conditions to stop generation
    ) -> None:
        self.do_sample = do_sample
        self.presence_penalty = presence_penalty
        self.frequency_penalty = frequency_penalty
        self.temperature = temperature
        self.top_p = top_p
        self.top_k = top_k
        self.ignore_eos = ignore_eos
        self.max_new_tokens = max_new_tokens
        self.stop_sequences = stop_sequences
        if not self.do_sample:
            # Greedy decoding: neutralize the sampling knobs.
            self.temperature = 1.0
            self.top_p = 1.0
            self.top_k = 1
        if 0.0 <= self.temperature < _SAMPLING_EPS:
            # temperature is too low; fall back to greedy search
            self.temperature = 1.0
            self.top_k = 1

    def verify(self):
        """Validate parameter ranges; raise ValueError on the first violation."""
        if self.presence_penalty < 0.0:
            raise ValueError(f"presence_penalty must >= 0.0, got {self.presence_penalty}")
        if self.frequency_penalty < 0.0:
            raise ValueError(f"frequency_penalty must >= 0.0, got {self.frequency_penalty}")
        if self.temperature <= 0.0:
            raise ValueError(f"temperature must > 0.0, got {self.temperature}")
        if self.top_p <= 0.0 or self.top_p > 1.0:
            raise ValueError(f"top_p must in (0.0, 1.0], got {self.top_p}")
        if self.top_k < -1 or self.top_k == 0:
            raise ValueError(f"top_k must be -1 (disable), or at least 1, got {self.top_k}.")
        if self.max_new_tokens < 1:
            raise ValueError(f"max_new_tokens must be at least 1 , got {self.max_new_tokens}.")

    def stop_sentences_to_token_ids(self, tokenizer):
        """Tokenize ``stop_sequences`` (a str or list of str) into lists of token ids in place."""
        if self.stop_sequences is None:
            self.stop_sequences = []
            return
        if isinstance(self.stop_sequences, str):
            self.stop_sequences = [self.stop_sequences]
        new_stop_sequences = []
        for stop_str in self.stop_sequences:
            stop_str_ids = tokenizer.encode(stop_str)
            if stop_str_ids is not None and len(stop_str_ids) >= 1:
                # remove the leading bos_token_id added by the tokenizer
                stop_str_ids = stop_str_ids[1:]
            # BUGFIX: the truthiness check also guards against encode()
            # returning None (the original crashed on len(None)).
            if stop_str_ids:
                new_stop_sequences.append(stop_str_ids)
        self.stop_sequences = new_stop_sequences

    def to_dict(self):
        """Serialize the sampling knobs into a plain dict for RPC transfer."""
        ret = {}
        ret["do_sample"] = self.do_sample
        ret["presence_penalty"] = self.presence_penalty
        ret["frequency_penalty"] = self.frequency_penalty
        ret["temperature"] = self.temperature
        ret["top_p"] = self.top_p
        ret["top_k"] = self.top_k
        # if self.ignore_eos is not None:
        #     ret["ignore_eos"] = self.ignore_eos
        return ret

View File

@@ -0,0 +1,45 @@
# Adapted from https://github.com/ModelTC/lightllm
import time
class Stats:
    """Token-throughput bookkeeping for the batching router (adapted from lightllm)."""

    def __init__(self, log_status, log_stats_interval) -> None:
        self.log_stats = log_status
        self.log_stats_interval = log_stats_interval
        self.last_log_time = time.time()
        self.all_tokens = 0
        self.output_tokens = 0
        self.prompt_tokens = 0

    def count_prompt_tokens(self, run_batch):
        """Accumulate the prompt tokens of ``run_batch`` (no-op when logging is off)."""
        if not self.log_stats:
            return
        tokens = run_batch.input_tokens()
        self.prompt_tokens += tokens
        self.all_tokens += tokens

    def count_output_tokens(self, run_batch):
        """Accumulate one generated token per request in ``run_batch``."""
        if not self.log_stats:
            return
        tokens = len(run_batch.reqs)
        self.output_tokens += tokens
        self.all_tokens += tokens

    def print_stats(self):
        """Every ``log_stats_interval`` seconds, print throughput and reset the counters."""
        if not self.log_stats:
            return
        now = time.time()
        if now - self.last_log_time <= self.log_stats_interval:
            return
        print(
            f"Avg tokens(prompt+generate) throughput: {self.all_tokens/(now-self.last_log_time):8.3f} tokens/s\n"
            f"Avg prompt tokens throughput: {self.prompt_tokens/(now-self.last_log_time):8.3f} tokens/s\n"
            f"Avg generate tokens throughput: {self.output_tokens/(now-self.last_log_time):8.3f} tokens/s"
        )
        self.all_tokens = 0
        self.output_tokens = 0
        self.prompt_tokens = 0
        self.last_log_time = now