[Hotfix] Fix bugs in testing continuous batching (#5270)

* fix bug

* fix bugs

* fix bugs

* fix bugs and add padding

* add funcs and fix bugs

* fix typos

* fix bugs

* add func
Author: Jianghai
Date: 2024-01-18 16:31:14 +08:00
Committed by: GitHub
Parent: 5ae9099f92
Commit: 9e2342bde2
6 changed files with 86 additions and 23 deletions


@@ -57,6 +57,9 @@ class RunningList:
     def is_empty(self):
         return not self.decoding and not self.prefill
 
+    def total_seq_num(self):
+        return len(self.decoding) + len(self.prefill)
+
 
 class RequestHandler:
     """
@@ -105,7 +108,13 @@ class RequestHandler:
                     f"the prompt(Request id = {seq.request_id}) length is longer than max_input_len, abort this sequence."
                 )
                 self.abort_sequence(seq.request_id)
+                remove_list.append(seq)
                 break
+
+            # stop feeding new sequences into the running list to ensure enough cache blocks remain available
+            if self.cache_manager.num_available_blocks <= self.running_list.total_seq_num():
+                break
+
             # Try to allocate cache blocks for the sequence.
             if (
                 self.cache_manager.check_allocation(seq)
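
Note (illustration, not part of the diff): this hunk changes two things. An over-long prompt is now also appended to remove_list before breaking, so it is actually dropped from the waiting list, and admission stops once the cache manager has no more free blocks than there are scheduled sequences. A hedged sketch of that admission guard, with invented names (admit_waiting, the one-block-per-admission rule) rather than the real RequestHandler.schedule:

# Illustration only: admit_waiting and the "one block per admission" rule are
# invented here; the real logic lives in RequestHandler.schedule.
def admit_waiting(waiting, running, num_available_blocks):
    """Move sequences from `waiting` into `running` while blocks remain."""
    admitted = []
    for seq in list(waiting):
        # Guard added by this commit: once free blocks no longer exceed the
        # number of scheduled sequences, stop admitting so the sequences that
        # are already decoding can still grow.
        if num_available_blocks <= len(running):
            break
        waiting.remove(seq)
        running.append(seq)
        admitted.append(seq)
        num_available_blocks -= 1  # toy assumption: one block per admission
    return admitted


waiting, running = ["s1", "s2", "s3"], ["s0"]
print(admit_waiting(waiting, running, num_available_blocks=2))  # ['s1']
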
@@ -115,7 +124,7 @@
                 # If succeed, add the sequence to running list.
                 remove_list.append(seq)
                 self.running_list.append(seq)
-                self.cache_manager.allocate_context_from_block_table(seq.block_table, seq.input_len)
+                self.cache_manager.allocate_context_from_block_table(seq.block_table, seq.sentence_len)
         for seq in remove_list:
             lst.remove(seq)
         if self.running_list.ready_for_prefill():
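
Note (illustration, not part of the diff): the one-line change above sizes the prefill allocation by seq.sentence_len instead of seq.input_len, matching the decode path in the next hunk. Assuming sentence_len means prompt length plus tokens generated so far (an assumption; ToySequence below is not the project's Sequence class), the difference looks like this:

# Illustration only: ToySequence is not the project's Sequence class; the
# meaning of sentence_len here (prompt + generated tokens) is an assumption.
class ToySequence:
    def __init__(self, prompt_token_ids):
        self.input_ids = list(prompt_token_ids)
        self.output_token_ids = []

    @property
    def input_len(self):
        return len(self.input_ids)

    @property
    def sentence_len(self):
        return len(self.input_ids) + len(self.output_token_ids)


seq = ToySequence(range(5))
seq.output_token_ids += [101, 102]                  # two tokens generated so far
assert (seq.input_len, seq.sentence_len) == (5, 7)  # the KV cache must cover 7
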
@@ -126,7 +135,13 @@
 
         if not self.running_batch.is_empty:
             for seq in self.running_batch.sequences_set:
-                self.cache_manager.allocate_token_from_block_table(seq.block_table, seq.sentence_len)
+                recycle = self.cache_manager.allocate_token_from_block_table(seq.block_table, seq.sentence_len)
+                if recycle:
+                    seq.recycle()
+                    self.running_batch.del_seq(seq)
+                    self.running_list.remove(seq)
+                    self.waiting_list[-1].append(seq)
+                    # the recycled sequences are handled with highest priority.
 
 
         return self.running_batch
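
Note (illustration, not part of the diff): the last hunk makes the decode-time allocation report when it cannot find a free block. The sequence is then recycled, removed from the running batch and running list, and appended to the last waiting-list bucket, which the in-code comment says is rescheduled with highest priority. A toy end-to-end sketch (ToyCacheManager, try_allocate_token and decode_step are invented names; the truthy "recycle" return value is inferred from the diff):

# Illustration only: simplified recycle path for a decode step.
class ToyCacheManager:
    def __init__(self, num_blocks):
        self.num_available_blocks = num_blocks

    def try_allocate_token(self, seq):
        if self.num_available_blocks == 0:
            return True  # signal the caller to recycle this sequence
        self.num_available_blocks -= 1
        return False


def decode_step(cache, running_batch, running_list, waiting_list):
    for seq in list(running_batch):           # copy: we mutate running_batch
        if cache.try_allocate_token(seq):
            running_batch.remove(seq)          # drop from the current batch
            running_list.remove(seq)           # and from the scheduler's pool
            waiting_list[-1].append(seq)       # re-queued for early rescheduling


cache = ToyCacheManager(num_blocks=1)
running_batch, running_list = ["seq-a", "seq-b"], ["seq-a", "seq-b"]
waiting_list = [[], [], []]
decode_step(cache, running_batch, running_list, waiting_list)
print(running_batch, waiting_list[-1])  # ['seq-a'] ['seq-b']
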