mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-05 19:13:01 +00:00
[Inference] Fix request handler and add recycle logic (#5260)
* fix request handler * fix comment
This commit is contained in:
@@ -208,9 +208,9 @@ class KVCacheManager:
|
||||
# The last allocated block may be either partially or fully occupied.
|
||||
# `alloc_local_block_idx` is the index of block to be allocated on provided block table.
|
||||
alloc_local_block_idx = context_len // self.block_size
|
||||
self.allocate_single_block(block_table, alloc_local_block_idx, 1)
|
||||
return self.allocate_single_block(block_table, alloc_local_block_idx)
|
||||
|
||||
def allocate_single_block(self, block_table: torch.Tensor, block_local_idx: int, space_asked: int) -> int:
|
||||
def allocate_single_block(self, block_table: torch.Tensor, block_local_idx: int) -> int:
|
||||
"""Allocate space asked on a single block in the block table, specified by the provided position id,
|
||||
and updates the provided block table with the allocated block.
|
||||
|
||||
@@ -221,11 +221,14 @@ class KVCacheManager:
|
||||
Returns:
|
||||
The remaining space required to be allocated (in other blocks).
|
||||
"""
|
||||
assert block_table.dim() == 1
|
||||
space_asked = 1
|
||||
block_global_id = block_table[block_local_idx].item()
|
||||
if block_global_id < 0:
|
||||
# Allocate a new block if the current position is not assigned a block yet
|
||||
assert self._available_blocks > 0, "No available blocks to allocate."
|
||||
if self._available_blocks <= 0:
|
||||
# No available blocks to allocate, we free current sequence and return it to
|
||||
self.free_block_table(block_table)
|
||||
return True
|
||||
free_block_id = torch.nonzero(self._block_states == 1).view(-1)[0]
|
||||
block: CacheBlock = self._cache_blocks[free_block_id]
|
||||
block.add_ref()
|
||||
@@ -235,6 +238,7 @@ class KVCacheManager:
|
||||
block_table[block_local_idx] = block_global_id
|
||||
block: CacheBlock = self._cache_blocks[block_global_id]
|
||||
return self._allocate_on_block(block, space_asked)
|
||||
# only when space asked if fully satisfied, the return value will be zero.
|
||||
|
||||
def free_block_table(self, block_table: torch.Tensor) -> None:
|
||||
"""Free the logical cache blocks for **a single sequence**."""
|
||||
@@ -269,7 +273,9 @@ class KVCacheManager:
|
||||
Returns:
|
||||
The remaining space required to be allocated (in other blocks).
|
||||
"""
|
||||
assert block.available_space > 0, "No available space on block to allocate."
|
||||
assert (
|
||||
block.available_space > 0
|
||||
), "Tried to allocate some space but found no available space left in chosen block."
|
||||
space_to_allocate = min(block.available_space, space_asked)
|
||||
block.allocate(space_to_allocate)
|
||||
return space_asked - space_to_allocate
|
||||
|
Reference in New Issue
Block a user