mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-15 22:19:38 +00:00
[Fix/Inference] Fix GQA Triton and Support Llama3 (#5624)
* [fix] GQA calling of flash decoding triton
* fix kv cache alloc shape
* fix rotary triton - GQA
* fix sequence max length assigning
* Sequence max length logic
* fix scheduling and spec-dec
* skip without import error
* fix pytest - skip without ImportError

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
@@ -386,6 +386,7 @@ class BatchBucket:
|
||||
seq_id, seq = next(seqs_iter)
|
||||
assert seq.output_len >= n_tokens, "Revoking len exceeds the current output len of the sequence"
|
||||
seq.output_token_id = seq.output_token_id[:-n_tokens]
|
||||
seq.revoke_finished_status()
|
||||
self._sequence_lengths[self._sequences_indexes[seq_id]] -= n_tokens
|
||||
|
||||
def clear(self, free_block_tables_fn: Optional[Callable[[torch.Tensor], None]]) -> List[int]:
|
||||
|
||||
Reference in New Issue
Block a user