[Feat]Tensor Model Parallel Support For Inference (#5563)

* tensor parallel support naive source * [fix]precision, model load and refactor the framework * add tp unit test * docstring * fix do_sample
2025-09-05 19:13:01 +00:00 · 2024-04-18 16:56:46 +08:00
parent be396ad6cc
commit e37ee2fb65
8 changed files with 640 additions and 150 deletions
--- a/colossalai/inference/modeling/policy/nopadding_llama.py
+++ b/colossalai/inference/modeling/policy/nopadding_llama.py
@@ -1,4 +1,3 @@
-from torch.nn import Parameter
 from transformers.models.llama.modeling_llama import LlamaDecoderLayer, LlamaForCausalLM, LlamaModel, LlamaRMSNorm

 from colossalai.inference.modeling.models.nopadding_llama import (
@@ -10,6 +9,7 @@ from colossalai.inference.modeling.models.nopadding_llama import (
    llama_rmsnorm_forward,
 )
 from colossalai.inference.utils import init_to_get_rotary
+from colossalai.shardformer.layer import Linear1D_Col, Linear1D_Row
 from colossalai.shardformer.policies.base_policy import ModulePolicyDescription, SubModuleReplacementDescription
 from colossalai.shardformer.policies.llama import LlamaForCausalLMPolicy

@@ -21,26 +21,69 @@ class NoPaddingLlamaModelInferPolicy(LlamaForCausalLMPolicy):
    def module_policy(self):
        policy = super().module_policy()

-        decoder_attribute_replacement = {
-            "lm_head.weight": Parameter(self.model.lm_head.weight.transpose(0, 1), requires_grad=False),
-        }
-        policy[LlamaForCausalLM] = ModulePolicyDescription(
-            attribute_replacement=decoder_attribute_replacement,
-        )
+        if self.shard_config.enable_tensor_parallelism:
+            decoder_attribute_replacement = {
+                "self_attn.hidden_size": self.model.config.hidden_size // self.shard_config.tensor_parallel_size,
+                "self_attn.num_heads": self.model.config.num_attention_heads // self.shard_config.tensor_parallel_size,
+            }
+            if getattr(self.model.config, "num_key_value_heads", False):
+                decoder_attribute_replacement["self_attn.num_key_value_heads"] = (
+                    self.model.config.num_key_value_heads // self.shard_config.tensor_parallel_size
+                )
+        else:
+            decoder_attribute_replacement = None

        policy[LlamaDecoderLayer] = ModulePolicyDescription(
+            attribute_replacement=decoder_attribute_replacement,
            sub_module_replacement=[
+                SubModuleReplacementDescription(
+                    suffix="mlp.gate_proj",
+                    target_module=Linear1D_Col,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="mlp.up_proj",
+                    target_module=Linear1D_Col,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="mlp.down_proj",
+                    target_module=Linear1D_Row,
+                ),
                SubModuleReplacementDescription(
                    suffix="mlp",
                    target_module=NopadLlamaMLP,
                ),
+                SubModuleReplacementDescription(
+                    suffix="self_attn.q_proj",
+                    target_module=Linear1D_Col,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="self_attn.k_proj",
+                    target_module=Linear1D_Col,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="self_attn.v_proj",
+                    target_module=Linear1D_Col,
+                ),
+                SubModuleReplacementDescription(
+                    suffix="self_attn.o_proj",
+                    target_module=Linear1D_Row,
+                ),
                SubModuleReplacementDescription(
                    suffix="self_attn",
                    target_module=NopadLlamaAttention,
                ),
-            ]
+            ],
        )

+        policy[LlamaForCausalLM] = ModulePolicyDescription(
+            sub_module_replacement=[
+                SubModuleReplacementDescription(
+                    suffix="lm_head", target_module=Linear1D_Col, kwargs={"gather_output": True}
+                )
+            ],
+        )
+
+        # self.shard_config._infer()
        self.append_or_create_method_replacement(
            description={"forward": llama_causal_lm_forward}, policy=policy, target_key=LlamaForCausalLM
        )