[inference] Adapted to Rotary Embedding and RMS Norm (#5283)

* adapted to rotary_embedding * adapted to nopad rms norm * fix bugs in benchmark * fix flash_decoding.py
2025-09-08 20:40:34 +00:00 · 2024-01-22 10:55:34 +08:00
parent 6e487e7d3c
commit bfff9254ac
5 changed files with 140 additions and 43 deletions
--- a/colossalai/inference/modeling/policy/llama.py
+++ b/colossalai/inference/modeling/policy/llama.py
@@ -1,11 +1,13 @@
 from functools import partial

+import torch
 from transformers.models.llama.modeling_llama import (
    LlamaAttention,
    LlamaDecoderLayer,
    LlamaFlashAttention2,
    LlamaForCausalLM,
    LlamaModel,
+    LlamaRMSNorm,
    LlamaSdpaAttention,
 )

@@ -15,11 +17,31 @@ from colossalai.inference.modeling.models.llama import (
    llama_decoder_layer_forward,
    llama_model_forward,
 )
+from colossalai.inference.utils import init_to_get_rotary
 from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, SubModuleReplacementDescription

 # import colossalai
 from colossalai.shardformer.policies.llama import LlamaForCausalLMPolicy

+try:
+    from colossalai.kernel.triton import rms_layernorm
+
+    HAS_TRITON_RMSNORM = True
+except Exception:  # best-effort optional dependency; do not swallow KeyboardInterrupt/SystemExit
+    print("you should install triton from https://github.com/openai/triton")
+    HAS_TRITON_RMSNORM = False
+
+
+def get_triton_rmsnorm_forward():
+    """Return a Triton-backed forward for ``LlamaRMSNorm``, or None if the kernel import failed."""
+    if not HAS_TRITON_RMSNORM:
+        return None
+
+    def _triton_rmsnorm_forward(self: LlamaRMSNorm, hidden_states: torch.Tensor):
+        return rms_layernorm(hidden_states, self.weight.data, self.variance_epsilon)
+
+    return _triton_rmsnorm_forward
+

 class LlamaModelInferPolicy(LlamaForCausalLMPolicy):
    def __init__(self) -> None:
@@ -162,4 +184,18 @@ class LlamaModelInferPolicy(LlamaForCausalLMPolicy):
            description=method_replacement, policy=policy, target_key=LlamaSdpaAttention
        )

+        infer_forward = None
+        if HAS_TRITON_RMSNORM:
+            infer_forward = get_triton_rmsnorm_forward()
+
+        if infer_forward is not None:
+            method_replacement = {"forward": partial(infer_forward)}
+            self.append_or_create_method_replacement(
+                description=method_replacement, policy=policy, target_key=LlamaRMSNorm
+            )
+
        return policy
+
+    def postprocess(self):  # Runs init_to_get_rotary on self.model.model, then returns self.model.
+        init_to_get_rotary(self.model.model)  # NOTE(review): result is discarded, so presumably mutates the inner model in place -- confirm
+        return self.model