[Inference]Adapt to baichuan2 13B (#5614)

* adapt to baichuan2 13B * adapt to baichuan2 13B * change BAICHUAN_MODEL_NAME_OR_PATH * fix test_decoding_attn.py * Modifications based on review comments. * change BAICHUAN_MODEL_NAME_OR_PATH * mv attn mask processes to test flash decoding * mv get_alibi_slopes baichuan modeling * fix bugs in test_baichuan.py
2025-09-06 19:40:28 +00:00 · 2024-04-25 23:11:30 +08:00
parent f342a93871
commit 3c91e3f176
10 changed files with 786 additions and 134 deletions
--- a/colossalai/inference/modeling/policy/nopadding_baichuan.py
+++ b/colossalai/inference/modeling/policy/nopadding_baichuan.py
@@ -1,12 +1,15 @@
 import torch.nn as nn
 from torch.nn import Parameter

-from colossalai.inference.modeling.models.nopadding_baichuan import NopadBaichuanAttention, NopadBaichuanMLP
+from colossalai.inference.modeling.models.nopadding_baichuan import (
+    NopadBaichuanAttention,
+    NopadBaichuanMLP,
+    baichuan_rmsnorm_forward,
+)
 from colossalai.inference.modeling.models.nopadding_llama import (
    llama_causal_lm_forward,
    llama_decoder_layer_forward,
    llama_model_forward,
-    llama_rmsnorm_forward,
 )
 from colossalai.inference.utils import init_to_get_rotary
 from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, SubModuleReplacementDescription
@@ -21,26 +24,30 @@ class NoPaddingBaichuanModelInferPolicy(LlamaForCausalLMPolicy):
        policy = super().module_policy()

        decoder_attribute_replacement = {
-            "lm_head.weight": Parameter(
-                nn.functional.normalize(self.model.lm_head.weight).transpose(0, 1), requires_grad=False
-            ),
+            "lm_head.weight": Parameter(nn.functional.normalize(self.model.lm_head.weight), requires_grad=False),
        }
        policy["BaichuanForCausalLM"] = ModulePolicyDescription(
            attribute_replacement=decoder_attribute_replacement,
        )

-        policy["DecoderLayer"] = ModulePolicyDescription(
-            sub_module_replacement=[
-                SubModuleReplacementDescription(
-                    suffix="mlp",
-                    target_module=NopadBaichuanMLP,
-                ),
-                SubModuleReplacementDescription(
-                    suffix="self_attn",
-                    target_module=NopadBaichuanAttention,
-                ),
-            ]
-        )
+        # used for relpacing Baichuan 7B/13B decoder layer
+        for layer_name in ["DecoderLayer", "BaichuanLayer"]:
+            policy[layer_name] = ModulePolicyDescription(
+                sub_module_replacement=[
+                    SubModuleReplacementDescription(
+                        suffix="mlp",
+                        target_module=NopadBaichuanMLP,
+                    ),
+                    SubModuleReplacementDescription(
+                        suffix="self_attn",
+                        target_module=NopadBaichuanAttention,
+                    ),
+                ]
+            )
+
+            self.append_or_create_method_replacement(
+                description={"forward": llama_decoder_layer_forward}, policy=policy, target_key=layer_name
+            )

        self.append_or_create_method_replacement(
            description={"forward": llama_causal_lm_forward}, policy=policy, target_key="BaichuanForCausalLM"
@@ -48,11 +55,9 @@ class NoPaddingBaichuanModelInferPolicy(LlamaForCausalLMPolicy):
        self.append_or_create_method_replacement(
            description={"forward": llama_model_forward}, policy=policy, target_key="BaichuanModel"
        )
+
        self.append_or_create_method_replacement(
-            description={"forward": llama_decoder_layer_forward}, policy=policy, target_key="DecoderLayer"
-        )
-        self.append_or_create_method_replacement(
-            description={"forward": llama_rmsnorm_forward}, policy=policy, target_key="RMSNorm"
+            description={"forward": baichuan_rmsnorm_forward}, policy=policy, target_key="RMSNorm"
        )

        return policy