Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-09 04:50:17 +00:00
[Hotfix] Fix bugs in testing continuous batching (#5270)
* fix bug
* fix bugs
* fix bugs
* fix bugs and add padding
* add funcs and fix bugs
* fix typos
* fix bugs
* add func
@@ -69,7 +69,7 @@ def convert_kvcache(cache, lengths, block_tables, pad_id=0):
         )
         padding = seq_len - _cache.size(0)
         if padding > 0:
-            _cache = F.pad(_cache, (0, 0, 0, 0, 0, 1), value=pad_id)
+            _cache = F.pad(_cache, (0, 0, 0, 0, 0, padding), value=pad_id)
         padded_cache.append(_cache)
     return torch.stack(padded_cache, dim=0)
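The change above replaces a hard-coded pad of one token with the computed `padding`, so a cached sequence that is shorter than `seq_len` by more than one token is padded all the way out and the per-sequence caches can be stacked into a single batch tensor. A minimal, self-contained sketch of that padding pattern follows; the helper name and the (cur_len, num_heads, head_dim) shape are assumptions for illustration, not the repository's exact code.

import torch
import torch.nn.functional as F

def pad_kvcache_to_length(caches, seq_len, pad_id=0):
    # Hypothetical helper (not ColossalAI's convert_kvcache): pad each
    # per-sequence cache slice to a common seq_len so they can be stacked.
    padded_cache = []
    for _cache in caches:  # assumed shape: (cur_len, num_heads, head_dim)
        padding = seq_len - _cache.size(0)
        if padding > 0:
            # F.pad pads from the last dimension backwards, so
            # (0, 0, 0, 0, 0, padding) leaves head_dim and num_heads alone
            # and appends `padding` rows along the sequence dimension.
            _cache = F.pad(_cache, (0, 0, 0, 0, 0, padding), value=pad_id)
        padded_cache.append(_cache)
    return torch.stack(padded_cache, dim=0)

caches = [torch.randn(3, 2, 4), torch.randn(5, 2, 4)]
print(pad_kvcache_to_length(caches, seq_len=5).shape)  # torch.Size([2, 5, 2, 4])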
@@ -173,7 +173,10 @@ def llama_attn_forward(
    key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
    value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
    kv_seq_len = max(sequence_lengths).item()
    cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
    query_states = query_states.transpose(1, 2)
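In this second hunk, the attention forward pass derives `kv_seq_len` from the longest sequence in the continuous batch, builds rotary cos/sin tables up to that length, and applies them to the query and key states before attention. Below is a minimal, self-contained sketch of that rotary step; the `rotate_half`/`apply_rotary_pos_emb` definitions and the (bsz, num_heads, seq_len, head_dim) layout follow the common HF-style convention and are assumptions for illustration, not ColossalAI's exact implementation.

import torch

def rotate_half(x):
    # Split the head dimension in half and rotate: (x1, x2) -> (-x2, x1).
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rotary_pos_emb(q, k, cos, sin, position_ids):
    # Gather cos/sin rows for each token position and broadcast over heads.
    cos = cos[position_ids].unsqueeze(1)  # (bsz, 1, seq_len, head_dim)
    sin = sin[position_ids].unsqueeze(1)
    q_embed = (q * cos) + (rotate_half(q) * sin)
    k_embed = (k * cos) + (rotate_half(k) * sin)
    return q_embed, k_embed

bsz, num_heads, q_len, head_dim = 2, 4, 6, 8
query_states = torch.randn(bsz, num_heads, q_len, head_dim)
key_states = torch.randn(bsz, num_heads, q_len, head_dim)

# As in the diff, kv_seq_len is taken from the longest sequence in the batch.
sequence_lengths = torch.tensor([4, 6])
kv_seq_len = max(sequence_lengths).item()

# Build cos/sin tables up to kv_seq_len, roughly what a rotary embedding
# module such as self.rotary_emb would return for seq_len=kv_seq_len.
inv_freq = 1.0 / (10000 ** (torch.arange(0, head_dim, 2).float() / head_dim))
freqs = torch.outer(torch.arange(kv_seq_len).float(), inv_freq)
emb = torch.cat((freqs, freqs), dim=-1)  # (kv_seq_len, head_dim)
cos, sin = emb.cos(), emb.sin()

position_ids = torch.arange(q_len).unsqueeze(0).expand(bsz, -1)
query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
print(query_states.shape, key_states.shape)  # torch.Size([2, 4, 6, 8]) each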