[Fix/Inference] Fix GQA Triton and Support Llama3 (#5624)

* [fix] GQA call into the flash-decoding Triton kernel (see the head-mapping sketch after this list)

* fix KV cache allocation shape

* fix rotary embedding Triton kernel for GQA

* fix sequence max length assignment

* revise sequence max length logic

* fix scheduling and spec-dec

* skip tests when imports fail

* fix pytest - skip on ImportError instead of raising
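
The GQA fixes above come down to how query heads are mapped onto shared KV heads when the flash-decoding and rotary kernels are invoked. Below is a minimal Python sketch of that mapping; the function name `kv_head_index` and its parameters are illustrative stand-ins, not the actual kernel interface changed in this PR.

def kv_head_index(q_head_idx: int, num_q_heads: int, num_kv_heads: int) -> int:
    """Map a query head to the KV head it shares under grouped-query attention."""
    # GQA requires the query heads to divide evenly into KV groups.
    assert num_q_heads % num_kv_heads == 0, "query heads must group evenly over KV heads"
    group_size = num_q_heads // num_kv_heads
    return q_head_idx // group_size

# Llama3-8B uses 32 query heads and 8 KV heads, so each KV head serves
# a group of 4 query heads; query head 5 therefore reads KV head 1.
assert kv_head_index(5, num_q_heads=32, num_kv_heads=8) == 1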

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Author: Yuanheng Zhao
Date: 2024-04-23 13:09:55 +08:00
Committed by: GitHub
Parent: ccf72797e3
Commit: 5d4c1fe8f5
9 changed files with 183 additions and 194 deletions


@@ -386,6 +386,7 @@ class BatchBucket:
         seq_id, seq = next(seqs_iter)
         assert seq.output_len >= n_tokens, "Revoking len exceeds the current output len of the sequence"
         seq.output_token_id = seq.output_token_id[:-n_tokens]
+        seq.revoke_finished_status()
         self._sequence_lengths[self._sequences_indexes[seq_id]] -= n_tokens

     def clear(self, free_block_tables_fn: Optional[Callable[[torch.Tensor], None]]) -> List[int]:
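
The hunk above adds a call to seq.revoke_finished_status() so that rolling back output tokens (e.g., when speculative decoding rejects drafted tokens) also clears a stale finished flag. Below is a hedged sketch of that idea using a simplified stand-in for the real Sequence class, covering only the EOS-based finish condition, not ColossalAI's actual implementation.

from typing import List

class Sequence:
    def __init__(self, eos_token_id: int):
        self.eos_token_id = eos_token_id
        self.output_token_id: List[int] = []
        self.finished = False

    def revoke_finished_status(self) -> None:
        # If revoking the tail removed the EOS token that finished this
        # sequence, it must become live again so decoding can resume.
        if self.finished and (
            not self.output_token_id
            or self.output_token_id[-1] != self.eos_token_id
        ):
            self.finished = False

Without this reset, a sequence whose drafted EOS token is rejected would stay marked finished and be dropped from scheduling even though decoding should continue.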