[Inference] add logit processor and request handler (#5166)

* add logit processor and request handler * add * add * add * fix * add search tokens and update func * finish request handler * add running list test * fix test * fix some bug * add * add * fix bugs * fix some bugs * fix bug * fix * fix * add copy fun * del useless attn * fix request status --------- Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
2025-09-12 20:54:35 +00:00 · 2023-12-25 12:15:15 +08:00
parent 8daee26989
commit 0e616462a7
10 changed files with 463 additions and 66 deletions
--- a/colossalai/inference/config.py
+++ b/colossalai/inference/config.py
@@ -1,3 +1,9 @@
+"""
+Our config consists of two parts:
+    1. inference_config: configs for inference, it is a unified api that wraps all the configs for inference.
+    2. generation_config: configs for generation, it is inherited from huggingface.
+"""
+
 import logging
 from dataclasses import dataclass
 from typing import Optional, Union
--- a/colossalai/inference/core/request_handler.py
+++ b/colossalai/inference/core/request_handler.py
@@ -1,71 +1,210 @@
 from typing import List

+import torch
+from transformers.configuration_utils import PretrainedConfig
+
+from colossalai.inference.config import InferenceConfig
+from colossalai.inference.kv_cache import KVCacheManager
+from colossalai.inference.logit_processors import logit_processor
+from colossalai.inference.sampler import *
 from colossalai.inference.struct import BatchInfo, Sequence


+class RunningList:
+    """
+    RunningList is an structure for recording the running sequences, contains prefill and decoding list.
+    Prefilling samples will be hold until the actual ratio of prefill samples versus decoding samples exceeds ratio.
+
+    Args:
+        prefill_ratio: (float) A ratio for determing whether to perform prefill or not.
+        prefill: (List) List that contains default inputs, defaults to [].
+    """
+
+    def __init__(self, prefill_ratio: str, prefill: List[Sequence] = None):
+        self.prefill_ratio = prefill_ratio
+        self.decoding: List[Sequence] = []
+        self.prefill: List[Sequence] = prefill if prefill is not None else []
+
+    def append(self, seq: Sequence):
+        # add seq to prefilling list first.
+        self.prefill.append(seq)
+
+    def find_seq(self, request_id):
+        for seq in self.decoding:
+            if request_id == seq.request_id:
+                return seq
+        for seq in self.prefill:
+            if request_id == seq.request_id:
+                return seq
+        return None
+
+    def remove(self, seq: Sequence):
+        if seq in self.decoding:
+            self.decoding.remove(seq)
+        elif seq in self.prefill:
+            self.prefill.remove(seq)
+        else:
+            raise ValueError(f"sequence {seq.request_id} is not in running list")
+
+    def ready_for_prefill(self):
+        if not self.decoding:
+            return len(self.prefill) > 0
+        return len(self.prefill) / len(self.decoding) >= self.ratio
+
+    def is_empty(self):
+        return not self.decoding and not self.prefill
+
+
 class RequestHandler:
    """
    RequestHandler is the core for handling existing requests and updating current batch.
    During generation process, we call schedule function each iteration to update current batch.

    Args:
-        inference_config: Store the configuration information related to inference.
-        model_config: The huggingface model config.
+       inference_config: Configuration for initialize and manage kv cache.
+       model_config: Configuration for model
    """

-    def __init__(self, inference_config, model_config) -> None:
+    def __init__(self, inference_config: InferenceConfig, model_config: PretrainedConfig) -> None:
        self.inference_config = inference_config
-        self.model_config = model_config
-        self._init_cache()
-        self.waiting_list: List["Sequence"] = []
-        self.running_list: List["Sequence"] = []
-        self.batch = BatchInfo.init_batch()
+        self._init_cache(model_config)

-    def _init_cache(self):
-        """
-        Initialize the cache manager with cache config.
-        """
+        self.running_list: RunningList = RunningList(inference_config.prefill_ratio)
+        self.waiting_list: List[List] = [[], [], []]
+        self.done_list: List[Sequence] = []
+        self.running_batch = BatchInfo(is_prompts=False)
+        self.prefill_batch = BatchInfo(is_prompts=True)
+
+    def _init_cache(self, model_config):
+        self.cache_manager = KVCacheManager(self.inference_config, model_config)
+
+    def _has_waiting(self) -> bool:
+        return any(lst for lst in self.waiting_list)

    def schedule(self):
        """
        The main logic of request handler.
        """
-        # The code below is only used for testing engine and will be modified.
-        if self.waiting_list:
-            self.running_list = self.waiting_list
-        self.batch.add_seqs(self.running_list)
-        return self.batch
+        if self._has_waiting():
+            # Try to allocate cache blocks for the sequence using a priority of prompt length.
+            for lst in reversed(self.waiting_list):
+                if lst:
+                    for seq in lst:
+                        if seq.prompt_len > self.inference_config.max_input_len:
+                            # If the prompt length is longer than max_input_len, abort the sequence.
+                            self.abort_sequence(seq.request_id)
+                            break
+                        # Try to allocate cache blocks for the sequence.
+                        if self.cache_manager.check_allocation(seq):
+                            # If succeed, add the sequence to running list.
+                            self.running_list.append(seq)
+                            self.cache_manager.allocate_context_from_block_table(seq.block_table, seq.prompt_len)
+                            lst.remove(seq)

-    def add_sequence(self, req_seq: "Sequence"):
+        if self.running_list.ready_for_prefill():
+            for seq in self.running_list.prefill:
+                seq.mark_running()
+            self.prefill_batch.init_batch(self.running_list.prefill)
+            return self.prefill_batch
+
+        return self.running_batch
+
+    def add_sequence(self, req: Sequence):
        """
        Add the request to waiting list.
        """
-        self.waiting_list.append(req_seq)
+        assert not self._find_sequence(req.request_id), f"Sequence {req.request_id} already exists."
+        assert (
+            req.prompt_len < self.inference_config.max_input_len
+        ), f"Sequence {req.request_id} exceeds input length limit"

-    def abort_sequence(self, seq_id: str):
-        """
-        Abort the request. #TODO :implement this
-        """
-        self._find_sequence(seq_id)
-        return
+        self.waiting_list[req.prompt_len * 3 // self.inference_config.max_input_len].append(req)

-    def _find_sequence(self, seq_id: str) -> "Sequence":
+    def abort_sequence(self, request_id: str):
        """
-        Find the request by seq_id.
+        Abort the request.
        """
+        seq, priority = self._find_sequence(request_id)
+        if seq.status.is_waiting:
+            seq.mark_aborted()
+            self.waiting_list[priority].remove(seq)
+        elif seq.status.is_running():
+            self.cache_manager.free_block_table(seq.block_table)
+            self.running_list.remove(seq)
+        else:
+            try:
+                self.done_list.remove(seq)
+            except:
+                return
+
+    def _find_sequence(self, request_id: str) -> Sequence:
+        """
+        Find the request by request_id.
+        """
+        for priority, lst in enumerate(self.waiting_list):
+            for seq in lst:
+                if seq.request_id == request_id:
+                    return seq, priority
+
+        if self.running_list.find_seq(request_id):
+            return seq, None
+
+        return None
+
+    def _sample(self, probs: torch.Tensor, logprobs: torch.Tensor, generation_config):
+        if generation_config.num_beams == 1:
+            if generation_config.do_sample:
+                sample_tokens = greedy_sample(generation_config, logprobs)
+            else:
+                sample_tokens = multinomial_sample(generation_config, probs)
+        else:
+            sample_tokens = beam_search_sample(generation_config, logprobs, is_prompt=not self.prefill_batch.is_empty)
+
+        return sample_tokens
+
+    def mark_finished(self, sequence: Sequence, generation_config):
+        if (
+            sequence.output_token_id[-1] == generation_config.eos_id
+            or sequence.output_len >= generation_config.max_output_len
+        ):
+            sequence.mark_finished()

    def check_unfinished_seqs(self) -> bool:
-        return len(self.waiting_list) != 0 or len(self.running_list) != 0
+        return self._has_waiting() or not self.running_list.is_empty()
+
+    def search_tokens(self, generation_config, logits):
+        """
+        Sample tokens for finished requests.
+        """
+        # do logit processor
+        # NOTE: need to decide the granularity to process logits (sequence or batch)
+        for type in ["top_p", "top_k", "min_p"]:
+            if type in generation_config:
+                logits = logit_processor(type, logits)
+
+        # calculate probs
+        probs = torch.softmax(logits, dim=-1, dtype=torch.float)
+        logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)
+
+        # sample the next tokens
+        sample_tokens = self._sample(probs, logprobs, generation_config)
+        self.running_batch.update_batch_tokens(sample_tokens)

    def update(self):
        """
-        Update the waiting list and running list.
+        Update current running list and done list
        """
+        if not self.prefill_batch.is_empty:
+            self.running_list.decoding.extend(self.running_list.prefill)
+            self.running_batch.add_seqs(self.running_list.prefill)
+            self.running_list.prefill.clear()
+            self.prefill_batch.clear_batch()

-        # The code below is only used for testing engine and will be modified.
-        self.waiting_list = []
-        self.running_list = []
-        finished_sequences = list(self.batch.sequences_set)
+        for seq in self.running_batch.sequences_set:
+            if seq.check_finish():
+                self.done_list.append(seq)
+                self.running_list.remove(seq)
+                self.running_batch.sequences_set.remove(seq)
+                self.cache_manager.free_block_table(seq.block_table)

-        self.batch.clear_batch()
-        return finished_sequences
+        return self.done_list
--- a/colossalai/inference/kv_cache/kvcache_manager.py
+++ b/colossalai/inference/kv_cache/kvcache_manager.py
@@ -4,6 +4,7 @@ import torch
 from transformers.configuration_utils import PretrainedConfig

 from colossalai.inference.config import InferenceConfig
+from colossalai.inference.struct import Sequence
 from colossalai.logging import get_dist_logger
 from colossalai.utils import get_current_device

@@ -99,11 +100,13 @@ class KVCacheManager:
        self._block_states_cum = torch.zeros(size=(self.num_blocks + 1,), dtype=torch.int64)
        self._block_finder = torch.zeros((self.num_blocks,), dtype=torch.int64)

-    def get_total_num_blocks(self) -> int:
+    @property
+    def total_num_blocks(self) -> int:
        """Get the total number of logical cache blocks."""
        return self.num_blocks

-    def get_num_available_blocks(self) -> int:
+    @property
+    def num_available_blocks(self) -> int:
        """Get the number of available cache blocks."""
        return self._available_blocks

@@ -114,6 +117,10 @@ class KVCacheManager:
        #      in the current batch.
        return self.max_blocks_per_sequence

+    def check_allocation(self, seq: Sequence) -> bool:
+        num_blocks_needed = (seq.prompt_len + self.max_output_length + self.block_size - 1) // self.block_size
+        return num_blocks_needed <= self.num_available_blocks
+
    def get_block_kv_ptrs(self, block_id: int, layer_id: int) -> Tuple[List[int], List[int]]:
        """Get the key and value pointers of physical caches (of specific layer) corresponding to a logical cache block."""
        block: CacheBlock = self._cache_blocks[block_id]
--- a/colossalai/inference/logit_processors.py
+++ b/colossalai/inference/logit_processors.py
@@ -0,0 +1,66 @@
+import torch
+import torch.nn.functional as F
+
+_LOGIT_PROCESSOR_MAP = {}
+
+
+def register_logit_processor(process_type):
+    """
+    register flops computation function for operation.
+    """
+
+    def register(func):
+        global _LOGIT_PROCESSOR_MAP
+        _LOGIT_PROCESSOR_MAP[process_type] = func
+        return func
+
+    return register
+
+
+@register_logit_processor("top_k")
+def top_k_logit_processor(logits, top_k: int):
+    """
+    top_k logit processor
+    """
+    indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
+    logits[indices_to_remove] = -float("inf")
+    return logits
+
+
+@register_logit_processor("top_p")
+def top_p_logit_processor(logits, top_p: float):
+    """
+    top_p logit processor
+    """
+    sorted_logits, sorted_indices = torch.sort(logits, descending=True)
+    cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+
+    sorted_indices_to_remove = cumulative_probs > top_p
+    sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+    sorted_indices_to_remove[..., 0] = 0
+
+    indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove)
+    logits[indices_to_remove] = -float("inf")
+    return logits
+
+def logit_processor(processor:str, logits , attrs):
+    """
+    do logit process for given logits.
+
+    Args:
+        processor(str): the type of logit processor 
+        logits(torch.Tensor): input logits
+        attrs(dict): attrs of the logit processor 
+
+    Returns:
+        logits after process
+    """
+    if processor not in _LOGIT_PROCESSOR_MAP:
+        return logits
+    else:
+        func = _LOGIT_PROCESSOR_MAP[processor]
+        try:
+            logits = func(logits, attrs)
+        except Exception as e:
+            return logits
+        return logits
--- a/colossalai/inference/sampler.py
+++ b/colossalai/inference/sampler.py
@@ -0,0 +1,62 @@
+from typing import List, Tuple
+
+import torch
+
+
+def greedy_sample(
+    generation_config,
+    logprobs: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Sample tokens greedyly.
+    """
+    results = torch.argmax(logprobs, dim=-1).cpu()
+    return results
+
+
+def multinomial_sample(
+    generation_config,
+    probs: torch.Tensor,
+) -> torch.Tensor:
+    """
+    Sample tokens in a random phase.
+    """
+    max_best_of = generation_config.best_of
+    random_results = torch.multinomial(probs, num_samples=max_best_of, replacement=True).cpu()
+    return random_results
+
+
+def beam_search_sample(
+    generation_config,
+    logprobs: torch.Tensor,
+    is_prompt: bool = False,
+) -> List[Tuple[List[int], List[int]]]:
+    """
+    Sample tokens with beam search.
+    We sample 2 * beam_width candidates to make sure that with high probability we can get `beam_width` candidates in addition to
+    the finished sequences for the next iteration.
+
+    ref:
+        https://github.com/tensorflow/tensor2tensor/blob/bafdc1b67730430d38d6ab802cbd51f9d053ba2e/tensor2tensor/utils/beam_search.py#L557-L563
+    for details. See also HF reference:
+        https://github.com/huggingface/transformers/blob/a4dd53d88e4852f023332d284ff07a01afcd5681/src/transformers/generation/utils.py#L3063-L3065
+
+    # NOTE: this beam search sample function is wrong now.
+    """
+
+    beam_width = generation_config.best_of
+    results = []
+    if is_prompt:
+        # Prompt phase.
+        parent_ids = [0] * (2 * beam_width)
+        _, next_token_ids = torch.topk(logprobs[0], 2 * beam_width)
+        next_token_ids = next_token_ids.tolist()
+    else:
+        # Generation phase.
+        # cumulative_logprobs = [seq_data[seq_id].cumulative_logprob for seq_id in seq_ids]
+        cumulative_logprobs = torch.tensor(logprobs, dtype=torch.float, device=seq_group_logprobs.device)
+        seq_group_logprobs = seq_group_logprobs + cumulative_logprobs.unsqueeze(dim=1)
+        _, topk_ids = torch.topk(logprobs.flatten(), 2 * beam_width)
+
+    results.append((next_token_ids, parent_ids))
+    return results
--- a/colossalai/inference/struct.py
+++ b/colossalai/inference/struct.py
@@ -1,6 +1,6 @@
 import enum
 from dataclasses import dataclass
-from typing import List, Union
+from typing import Any, List, Union

 import torch
 from ordered_set import OrderedSet
@@ -21,8 +21,7 @@ class RequestStatus(enum.Enum):

    # running status
    WAITING = enum.auto()
-    PREFILL = enum.auto()
-    TOKEN = enum.auto()
+    RUNNING = enum.auto()
    ABORTED = enum.auto()

    # completion status
@@ -40,10 +39,7 @@ class RequestStatus(enum.Enum):

    @staticmethod
    def is_running(status: "RequestStatus") -> bool:
-        return status in [
-            RequestStatus.PREFILL,
-            RequestStatus.TOKEN,
-        ]
+        return status == RequestStatus.RUNNING

    @staticmethod
    def is_waiting(status: "RequestStatus") -> bool:
@@ -69,7 +65,7 @@ class Sequence:
    prompt: str
    input_token_id: List[int]
    block_size: int
-    sample_params: any  # SampleParams needs to be imported later.
+    sample_params: Any  # SampleParams needs to be imported later.
    block_table: torch.Tensor
    eos_token_id: int
    max_output_len: int = 256
@@ -78,21 +74,31 @@ class Sequence:
        self.output_token_id = []
        self.status = RequestStatus.WAITING

-    def get_sentence_len(self) -> None:
+    @property
+    def prompt_len(self) -> int:
+        """
+        Get length of prompts
+        """
+        return len(self.input_token_id)
+
+    @property
+    def sentence_len(self) -> int:
        """
        Get length of current sentence.
        """
        return len(self.input_token_id) + len(self.output_token_id)

-    def get_input_len(self) -> None:
+    @property
+    def input_len(self) -> int:
        """
        Get length of input sentence.
        """
        return len(self.input_token_id)

-    def get_output_len(self) -> None:
+    @property
+    def output_len(self) -> int:
        """
-        Get output length of current sentence.
+        Get length of output sentence.
        """
        return len(self.output_token_id)

@@ -116,12 +122,32 @@ class Sequence:
    def __hash__(self):
        return hash(self.request_id)

+    def mark_running(self) -> None:
+        """
+        Set status for prefill reqs.
+        """
+        assert self.status == RequestStatus.WAITING, "Sequence is not in WAITTING STATUS"
+        self.status = RequestStatus.RUNNING
+
+    def mark_finished(self) -> None:
+        """
+        Set status for finished reqs.
+        """
+        self.status = RequestStatus.COMPLETED
+
+    def mark_aborted(self) -> None:
+        """
+        Set status for aborted reqs.
+        """
+        self.status = RequestStatus.ABORTED
+
    def __repr__(self) -> str:
        return (
            f"Request ID(request_id={self.request_id}, "
            f"prompt={self.prompt}, "
            f"status={self.status.name}, "
-            f"sample_params={self.sample_params}"
+            f"sample_params={self.sample_params}, "
+            f"logical block number={len(self.block_table_index)}"
        )


@@ -131,7 +157,8 @@ class BatchInfo:
    Information to be passed and used for a batch of sequences.
    """

-    sequences_set: OrderedSet["Sequence"]
+    sequences_set: OrderedSet["Sequence"] = None
+    is_prompts: bool = True

    @classmethod
    def init_batch(cls, seqs: List["Sequence"] = None) -> "BatchInfo":
@@ -214,6 +241,7 @@ class BatchInfo:
                continue
            self.sequences_set.add(seq)

+    @property
    def is_empty(self) -> None:
        """
        Check whether sequences_set is empty.