[Inference] add logit processor and request handler (#5166)

* add logit processor and request handler

* add

* add

* add

* fix

* add search tokens and update func

* finish request handler

* add running list test

* fix test

* fix some bug

* add

* add

* fix bugs

* fix some bugs

* fix bug

* fix

* fix

* add copy fun

* del useless attn

* fix request status

---------

Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
This commit is contained in:
Jianghai
2023-12-25 12:15:15 +08:00
committed by FrankLeeeee
parent 8daee26989
commit 0e616462a7
10 changed files with 463 additions and 66 deletions

View File

@@ -1,3 +1,9 @@
"""
Our config consists of two parts:
1. inference_config: configs for inference, it is a unified api that wraps all the configs for inference.
2. generation_config: configs for generation, it is inherited from huggingface.
"""
import logging
from dataclasses import dataclass
from typing import Optional, Union

View File

@@ -1,71 +1,210 @@
from typing import List
import torch
from transformers.configuration_utils import PretrainedConfig
from colossalai.inference.config import InferenceConfig
from colossalai.inference.kv_cache import KVCacheManager
from colossalai.inference.logit_processors import logit_processor
from colossalai.inference.sampler import *
from colossalai.inference.struct import BatchInfo, Sequence
class RunningList:
"""
RunningList is an structure for recording the running sequences, contains prefill and decoding list.
Prefilling samples will be hold until the actual ratio of prefill samples versus decoding samples exceeds ratio.
Args:
prefill_ratio: (float) A ratio for determing whether to perform prefill or not.
prefill: (List) List that contains default inputs, defaults to [].
"""
def __init__(self, prefill_ratio: str, prefill: List[Sequence] = None):
self.prefill_ratio = prefill_ratio
self.decoding: List[Sequence] = []
self.prefill: List[Sequence] = prefill if prefill is not None else []
def append(self, seq: Sequence):
# add seq to prefilling list first.
self.prefill.append(seq)
def find_seq(self, request_id):
for seq in self.decoding:
if request_id == seq.request_id:
return seq
for seq in self.prefill:
if request_id == seq.request_id:
return seq
return None
def remove(self, seq: Sequence):
if seq in self.decoding:
self.decoding.remove(seq)
elif seq in self.prefill:
self.prefill.remove(seq)
else:
raise ValueError(f"sequence {seq.request_id} is not in running list")
def ready_for_prefill(self):
if not self.decoding:
return len(self.prefill) > 0
return len(self.prefill) / len(self.decoding) >= self.ratio
def is_empty(self):
return not self.decoding and not self.prefill
class RequestHandler:
"""
RequestHandler is the core for handling existing requests and updating current batch.
During generation process, we call schedule function each iteration to update current batch.
Args:
inference_config: Store the configuration information related to inference.
model_config: The huggingface model config.
inference_config: Configuration for initialize and manage kv cache.
model_config: Configuration for model
"""
def __init__(self, inference_config, model_config) -> None:
def __init__(self, inference_config: InferenceConfig, model_config: PretrainedConfig) -> None:
self.inference_config = inference_config
self.model_config = model_config
self._init_cache()
self.waiting_list: List["Sequence"] = []
self.running_list: List["Sequence"] = []
self.batch = BatchInfo.init_batch()
self._init_cache(model_config)
def _init_cache(self):
"""
Initialize the cache manager with cache config.
"""
self.running_list: RunningList = RunningList(inference_config.prefill_ratio)
self.waiting_list: List[List] = [[], [], []]
self.done_list: List[Sequence] = []
self.running_batch = BatchInfo(is_prompts=False)
self.prefill_batch = BatchInfo(is_prompts=True)
def _init_cache(self, model_config):
self.cache_manager = KVCacheManager(self.inference_config, model_config)
def _has_waiting(self) -> bool:
return any(lst for lst in self.waiting_list)
def schedule(self):
"""
The main logic of request handler.
"""
# The code below is only used for testing engine and will be modified.
if self.waiting_list:
self.running_list = self.waiting_list
self.batch.add_seqs(self.running_list)
return self.batch
if self._has_waiting():
# Try to allocate cache blocks for the sequence using a priority of prompt length.
for lst in reversed(self.waiting_list):
if lst:
for seq in lst:
if seq.prompt_len > self.inference_config.max_input_len:
# If the prompt length is longer than max_input_len, abort the sequence.
self.abort_sequence(seq.request_id)
break
# Try to allocate cache blocks for the sequence.
if self.cache_manager.check_allocation(seq):
# If succeed, add the sequence to running list.
self.running_list.append(seq)
self.cache_manager.allocate_context_from_block_table(seq.block_table, seq.prompt_len)
lst.remove(seq)
def add_sequence(self, req_seq: "Sequence"):
if self.running_list.ready_for_prefill():
for seq in self.running_list.prefill:
seq.mark_running()
self.prefill_batch.init_batch(self.running_list.prefill)
return self.prefill_batch
return self.running_batch
def add_sequence(self, req: Sequence):
"""
Add the request to waiting list.
"""
self.waiting_list.append(req_seq)
assert not self._find_sequence(req.request_id), f"Sequence {req.request_id} already exists."
assert (
req.prompt_len < self.inference_config.max_input_len
), f"Sequence {req.request_id} exceeds input length limit"
def abort_sequence(self, seq_id: str):
"""
Abort the request. #TODO :implement this
"""
self._find_sequence(seq_id)
return
self.waiting_list[req.prompt_len * 3 // self.inference_config.max_input_len].append(req)
def _find_sequence(self, seq_id: str) -> "Sequence":
def abort_sequence(self, request_id: str):
"""
Find the request by seq_id.
Abort the request.
"""
seq, priority = self._find_sequence(request_id)
if seq.status.is_waiting:
seq.mark_aborted()
self.waiting_list[priority].remove(seq)
elif seq.status.is_running():
self.cache_manager.free_block_table(seq.block_table)
self.running_list.remove(seq)
else:
try:
self.done_list.remove(seq)
except:
return
def _find_sequence(self, request_id: str) -> Sequence:
"""
Find the request by request_id.
"""
for priority, lst in enumerate(self.waiting_list):
for seq in lst:
if seq.request_id == request_id:
return seq, priority
if self.running_list.find_seq(request_id):
return seq, None
return None
def _sample(self, probs: torch.Tensor, logprobs: torch.Tensor, generation_config):
if generation_config.num_beams == 1:
if generation_config.do_sample:
sample_tokens = greedy_sample(generation_config, logprobs)
else:
sample_tokens = multinomial_sample(generation_config, probs)
else:
sample_tokens = beam_search_sample(generation_config, logprobs, is_prompt=not self.prefill_batch.is_empty)
return sample_tokens
def mark_finished(self, sequence: Sequence, generation_config):
if (
sequence.output_token_id[-1] == generation_config.eos_id
or sequence.output_len >= generation_config.max_output_len
):
sequence.mark_finished()
def check_unfinished_seqs(self) -> bool:
return len(self.waiting_list) != 0 or len(self.running_list) != 0
return self._has_waiting() or not self.running_list.is_empty()
def search_tokens(self, generation_config, logits):
"""
Sample tokens for finished requests.
"""
# do logit processor
# NOTE: need to decide the granularity to process logits (sequence or batch)
for type in ["top_p", "top_k", "min_p"]:
if type in generation_config:
logits = logit_processor(type, logits)
# calculate probs
probs = torch.softmax(logits, dim=-1, dtype=torch.float)
logprobs = torch.log_softmax(logits, dim=-1, dtype=torch.float)
# sample the next tokens
sample_tokens = self._sample(probs, logprobs, generation_config)
self.running_batch.update_batch_tokens(sample_tokens)
def update(self):
"""
Update the waiting list and running list.
Update current running list and done list
"""
if not self.prefill_batch.is_empty:
self.running_list.decoding.extend(self.running_list.prefill)
self.running_batch.add_seqs(self.running_list.prefill)
self.running_list.prefill.clear()
self.prefill_batch.clear_batch()
# The code below is only used for testing engine and will be modified.
self.waiting_list = []
self.running_list = []
finished_sequences = list(self.batch.sequences_set)
for seq in self.running_batch.sequences_set:
if seq.check_finish():
self.done_list.append(seq)
self.running_list.remove(seq)
self.running_batch.sequences_set.remove(seq)
self.cache_manager.free_block_table(seq.block_table)
self.batch.clear_batch()
return finished_sequences
return self.done_list

View File

@@ -4,6 +4,7 @@ import torch
from transformers.configuration_utils import PretrainedConfig
from colossalai.inference.config import InferenceConfig
from colossalai.inference.struct import Sequence
from colossalai.logging import get_dist_logger
from colossalai.utils import get_current_device
@@ -99,11 +100,13 @@ class KVCacheManager:
self._block_states_cum = torch.zeros(size=(self.num_blocks + 1,), dtype=torch.int64)
self._block_finder = torch.zeros((self.num_blocks,), dtype=torch.int64)
def get_total_num_blocks(self) -> int:
@property
def total_num_blocks(self) -> int:
"""Get the total number of logical cache blocks."""
return self.num_blocks
def get_num_available_blocks(self) -> int:
@property
def num_available_blocks(self) -> int:
"""Get the number of available cache blocks."""
return self._available_blocks
@@ -114,6 +117,10 @@ class KVCacheManager:
# in the current batch.
return self.max_blocks_per_sequence
def check_allocation(self, seq: Sequence) -> bool:
num_blocks_needed = (seq.prompt_len + self.max_output_length + self.block_size - 1) // self.block_size
return num_blocks_needed <= self.num_available_blocks
def get_block_kv_ptrs(self, block_id: int, layer_id: int) -> Tuple[List[int], List[int]]:
"""Get the key and value pointers of physical caches (of specific layer) corresponding to a logical cache block."""
block: CacheBlock = self._cache_blocks[block_id]

View File

@@ -0,0 +1,66 @@
import torch
import torch.nn.functional as F
_LOGIT_PROCESSOR_MAP = {}
def register_logit_processor(process_type):
"""
register flops computation function for operation.
"""
def register(func):
global _LOGIT_PROCESSOR_MAP
_LOGIT_PROCESSOR_MAP[process_type] = func
return func
return register
@register_logit_processor("top_k")
def top_k_logit_processor(logits, top_k: int):
"""
top_k logit processor
"""
indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
logits[indices_to_remove] = -float("inf")
return logits
@register_logit_processor("top_p")
def top_p_logit_processor(logits, top_p: float):
"""
top_p logit processor
"""
sorted_logits, sorted_indices = torch.sort(logits, descending=True)
cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
sorted_indices_to_remove = cumulative_probs > top_p
sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
sorted_indices_to_remove[..., 0] = 0
indices_to_remove = sorted_indices_to_remove.scatter(dim=1, index=sorted_indices, src=sorted_indices_to_remove)
logits[indices_to_remove] = -float("inf")
return logits
def logit_processor(processor:str, logits , attrs):
"""
do logit process for given logits.
Args:
processor(str): the type of logit processor
logits(torch.Tensor): input logits
attrs(dict): attrs of the logit processor
Returns:
logits after process
"""
if processor not in _LOGIT_PROCESSOR_MAP:
return logits
else:
func = _LOGIT_PROCESSOR_MAP[processor]
try:
logits = func(logits, attrs)
except Exception as e:
return logits
return logits

View File

@@ -0,0 +1,62 @@
from typing import List, Tuple
import torch
def greedy_sample(
generation_config,
logprobs: torch.Tensor,
) -> torch.Tensor:
"""
Sample tokens greedyly.
"""
results = torch.argmax(logprobs, dim=-1).cpu()
return results
def multinomial_sample(
generation_config,
probs: torch.Tensor,
) -> torch.Tensor:
"""
Sample tokens in a random phase.
"""
max_best_of = generation_config.best_of
random_results = torch.multinomial(probs, num_samples=max_best_of, replacement=True).cpu()
return random_results
def beam_search_sample(
generation_config,
logprobs: torch.Tensor,
is_prompt: bool = False,
) -> List[Tuple[List[int], List[int]]]:
"""
Sample tokens with beam search.
We sample 2 * beam_width candidates to make sure that with high probability we can get `beam_width` candidates in addition to
the finished sequences for the next iteration.
ref:
https://github.com/tensorflow/tensor2tensor/blob/bafdc1b67730430d38d6ab802cbd51f9d053ba2e/tensor2tensor/utils/beam_search.py#L557-L563
for details. See also HF reference:
https://github.com/huggingface/transformers/blob/a4dd53d88e4852f023332d284ff07a01afcd5681/src/transformers/generation/utils.py#L3063-L3065
# NOTE: this beam search sample function is wrong now.
"""
beam_width = generation_config.best_of
results = []
if is_prompt:
# Prompt phase.
parent_ids = [0] * (2 * beam_width)
_, next_token_ids = torch.topk(logprobs[0], 2 * beam_width)
next_token_ids = next_token_ids.tolist()
else:
# Generation phase.
# cumulative_logprobs = [seq_data[seq_id].cumulative_logprob for seq_id in seq_ids]
cumulative_logprobs = torch.tensor(logprobs, dtype=torch.float, device=seq_group_logprobs.device)
seq_group_logprobs = seq_group_logprobs + cumulative_logprobs.unsqueeze(dim=1)
_, topk_ids = torch.topk(logprobs.flatten(), 2 * beam_width)
results.append((next_token_ids, parent_ids))
return results

View File

@@ -1,6 +1,6 @@
import enum
from dataclasses import dataclass
from typing import List, Union
from typing import Any, List, Union
import torch
from ordered_set import OrderedSet
@@ -21,8 +21,7 @@ class RequestStatus(enum.Enum):
# running status
WAITING = enum.auto()
PREFILL = enum.auto()
TOKEN = enum.auto()
RUNNING = enum.auto()
ABORTED = enum.auto()
# completion status
@@ -40,10 +39,7 @@ class RequestStatus(enum.Enum):
@staticmethod
def is_running(status: "RequestStatus") -> bool:
return status in [
RequestStatus.PREFILL,
RequestStatus.TOKEN,
]
return status == RequestStatus.RUNNING
@staticmethod
def is_waiting(status: "RequestStatus") -> bool:
@@ -69,7 +65,7 @@ class Sequence:
prompt: str
input_token_id: List[int]
block_size: int
sample_params: any # SampleParams needs to be imported later.
sample_params: Any # SampleParams needs to be imported later.
block_table: torch.Tensor
eos_token_id: int
max_output_len: int = 256
@@ -78,21 +74,31 @@ class Sequence:
self.output_token_id = []
self.status = RequestStatus.WAITING
def get_sentence_len(self) -> None:
@property
def prompt_len(self) -> int:
"""
Get length of prompts
"""
return len(self.input_token_id)
@property
def sentence_len(self) -> int:
"""
Get length of current sentence.
"""
return len(self.input_token_id) + len(self.output_token_id)
def get_input_len(self) -> None:
@property
def input_len(self) -> int:
"""
Get length of input sentence.
"""
return len(self.input_token_id)
def get_output_len(self) -> None:
@property
def output_len(self) -> int:
"""
Get output length of current sentence.
Get length of output sentence.
"""
return len(self.output_token_id)
@@ -116,12 +122,32 @@ class Sequence:
def __hash__(self):
return hash(self.request_id)
def mark_running(self) -> None:
"""
Set status for prefill reqs.
"""
assert self.status == RequestStatus.WAITING, "Sequence is not in WAITTING STATUS"
self.status = RequestStatus.RUNNING
def mark_finished(self) -> None:
"""
Set status for finished reqs.
"""
self.status = RequestStatus.COMPLETED
def mark_aborted(self) -> None:
"""
Set status for aborted reqs.
"""
self.status = RequestStatus.ABORTED
def __repr__(self) -> str:
return (
f"Request ID(request_id={self.request_id}, "
f"prompt={self.prompt}, "
f"status={self.status.name}, "
f"sample_params={self.sample_params}"
f"sample_params={self.sample_params}, "
f"logical block number={len(self.block_table_index)}"
)
@@ -131,7 +157,8 @@ class BatchInfo:
Information to be passed and used for a batch of sequences.
"""
sequences_set: OrderedSet["Sequence"]
sequences_set: OrderedSet["Sequence"] = None
is_prompts: bool = True
@classmethod
def init_batch(cls, seqs: List["Sequence"] = None) -> "BatchInfo":
@@ -214,6 +241,7 @@ class BatchInfo:
continue
self.sequences_set.add(seq)
@property
def is_empty(self) -> None:
"""
Check whether sequences_set is empty.