[shardformer] support ep for deepseek v3 (#6185)

* [feature] support ep for deepseek v3

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix test

* [shardformer] fix deepseek v3 init

* [lazy] fit lora for lazy init

* [example] support npu for deepseek v3

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Hongxin Liu
2025-02-11 16:10:25 +08:00
committed by GitHub
parent 17062c83b9
commit 2b415e5999
13 changed files with 612 additions and 22 deletions
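
Note for readers new to the feature: expert parallelism (EP) splits the routed experts of each MoE layer across the ranks of an EP process group, so every rank materializes only its own slice of experts and tokens are exchanged between ranks during routing. A minimal sketch of the partitioning idea only (illustrative, not the shardformer implementation; `shard_experts` is a made-up name):

import torch.nn as nn

def shard_experts(experts: nn.ModuleList, ep_rank: int, ep_size: int) -> nn.ModuleList:
    # Each rank keeps n_routed_experts / ep_size experts; token dispatch
    # across ranks is then handled with all-to-all collectives.
    assert len(experts) % ep_size == 0, "experts must divide evenly across the EP group"
    per_rank = len(experts) // ep_size
    start = ep_rank * per_rank
    return experts[start : start + per_rank]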

View File

@@ -4,11 +4,13 @@ import resource
 import time
 import warnings
 from contextlib import nullcontext
+from types import MethodType

 import torch
 import torch.distributed as dist
 from data_utils import RandomDataset
 from model_utils import format_numel_str, get_model_numel
+from peft import LoraConfig
 from performance_evaluator import PerformanceEvaluator, get_profile_context
 from tqdm import tqdm
 from transformers import AutoConfig, AutoModelForCausalLM
@@ -29,7 +31,7 @@ warnings.filterwarnings("ignore")
 # We have lots of llamas for your choice!
 MODEL_CONFIGS = {
-    "100m": lambda: AutoConfig.from_pretrained(
+    "100m": AutoConfig.from_pretrained(
         "deepseek-ai/deepseek-moe-16b-base",
         max_position_embeddings=4096,
         num_hidden_layers=1,
@@ -44,20 +46,29 @@ MODEL_CONFIGS = {
         attn_implementation="flash_attention_2",
         trust_remote_code=True,
     ),
-    "7b": lambda: AutoConfig.from_pretrained(
+    "7b": AutoConfig.from_pretrained(
         "deepseek-ai/deepseek-moe-16b-base",
         max_position_embeddings=4096,
         num_hidden_layers=13,
         attn_implementation="flash_attention_2",
         trust_remote_code=True,
     ),
-    "14b": lambda: AutoConfig.from_pretrained(
+    "14b": AutoConfig.from_pretrained(
         "deepseek-ai/deepseek-moe-16b-base",
         max_position_embeddings=4096,
         num_hidden_layers=26,
         attn_implementation="flash_attention_2",
         trust_remote_code=True,
     ),
+    "v3-6b": AutoConfig.from_pretrained(
+        "deepseek-ai/DeepSeek-V3",
+        num_hidden_layers=5,
+        first_k_dense_replace=2,
+        n_routed_experts=32,
+        vocab_size=8192,
+        attn_implementation="flash_attention_2",
+        trust_remote_code=True,
+    ),
 }
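
Note: the new "v3-6b" preset is a truncated DeepSeek-V3 for benchmarking: 5 layers, a small vocabulary, and 32 routed experts instead of the full model's count. In DeepSeek-style configs, layers below first_k_dense_replace use a dense MLP and the remaining layers are MoE (with the default moe_layer_freq of 1), so this preset yields 2 dense and 3 MoE layers. A sketch of that layout rule, assuming the upstream modeling convention:

num_hidden_layers = 5
first_k_dense_replace = 2
# layers below the threshold stay dense; the rest route through experts
layer_kinds = ["dense" if i < first_k_dense_replace else "moe" for i in range(num_hidden_layers)]
print(layer_kinds)  # ['dense', 'dense', 'moe', 'moe', 'moe']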
@@ -119,6 +130,7 @@ def main():
help="Sequence parallelism mode",
)
parser.add_argument("--debug", action="store_true", help="Enable debug mode")
parser.add_argument("--enable_lora", action="store_true", help="Enable LoRA")
args = parser.parse_args()
colossalai.launch_from_torch()
@@ -151,7 +163,7 @@ def main():
             sp_size=args.sp,
             sequence_parallelism_mode=args.sp_mode,
             enable_sequence_parallelism=args.sp > 1,
-            enable_fused_normalization=torch.cuda.is_available(),
+            enable_fused_normalization=get_accelerator().is_available(),
             enable_flash_attention=args.xformers,
             microbatch_size=args.mbs,
             precision="bf16",
@@ -171,7 +183,10 @@ def main():
     # ==============================
     dp_size = getattr(plugin, "dp_size", coordinator.world_size)

-    config = MODEL_CONFIGS[args.config]()
+    if args.config in MODEL_CONFIGS:
+        config = MODEL_CONFIGS[args.config]
+    else:
+        config = AutoConfig.from_pretrained(args.config, trust_remote_code=True)

     torch.cuda.manual_seed(42)
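
Note: dropping the lambda wrappers means MODEL_CONFIGS now holds prebuilt config objects, so the hunk above can test membership and index directly; any name not in the registry falls through to the Hub. Restated as a standalone helper (a sketch; resolve_config is not in the source):

from transformers import AutoConfig

def resolve_config(name: str, presets: dict):
    if name in presets:
        return presets[name]  # prebuilt config object, no call needed anymore
    # anything else is treated as a Hugging Face model id or local path
    return AutoConfig.from_pretrained(name, trust_remote_code=True)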
@@ -189,11 +204,25 @@ def main():
         else nullcontext()
     )

+    attn_impl = "eager" if get_accelerator().name == "npu" else "flash_attention_2"
     with init_ctx:
-        model = AutoModelForCausalLM.from_config(config, trust_remote_code=True).to(torch.bfloat16)
+        model = AutoModelForCausalLM.from_config(
+            config, trust_remote_code=True, attn_implementation=attn_impl, torch_dtype=torch.bfloat16
+        ).to(torch.bfloat16)
+    if args.enable_lora:
+        booster.enable_lora(
+            model,
+            lora_config=LoraConfig(task_type="CAUSAL_LM", target_modules=["gate_proj", "up_proj", "down_proj"]),
+        )

     if args.grad_checkpoint:
         model.gradient_checkpointing_enable()
+    if model.__class__.__name__.startswith("DeepseekV3"):
+        model.eval()
+        # enable grad for moe layers
+        for m in model.modules():
+            if m.__class__.__name__ == "DeepseekV3MoE":
+                m.moe_infer = MethodType(m.moe_infer.__wrapped__, m)

     model_numel = get_model_numel(model)
     coordinator.print_on_master(f"Model params: {format_numel_str(model_numel)}")
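
Note on the DeepseekV3 block above: in the upstream DeepSeek modeling code, DeepseekV3MoE takes its moe_infer path outside training mode, and moe_infer is decorated with torch.no_grad(); putting the model in eval() and rebinding the undecorated function kept on .__wrapped__ restores gradient flow so the backward pass can still be benchmarked. A self-contained sketch of the trick (Expert is a stand-in class, not from the source):

from types import MethodType

import torch

class Expert(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.w = torch.nn.Parameter(torch.randn(4))

    @torch.no_grad()  # inference-only path, like DeepseekV3MoE.moe_infer
    def infer(self, x):
        return x * self.w

m = Expert()
assert not m.infer(torch.ones(4)).requires_grad  # decorator blocks autograd
# torch.no_grad() wraps via functools.wraps, so the original function
# survives on .__wrapped__; rebinding it re-enables gradient tracking.
m.infer = MethodType(m.infer.__wrapped__, m)
assert m.infer(torch.ones(4)).requires_grad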

View File

@@ -7,6 +7,7 @@ from torch import Tensor
 from torch.profiler import ProfilerActivity, profile, schedule, tensorboard_trace_handler

 from colossalai.cluster import DistCoordinator
+from colossalai.utils import get_current_device


 def divide(x: float, y: float) -> float:
@@ -29,7 +30,7 @@ def all_reduce_mean(x: float, world_size: int) -> float:
     # tensor = tensor / world_size
     # return tensor.item()
-    tensor = torch.tensor([x], device=torch.cuda.current_device(), dtype=torch.float)
+    tensor = torch.tensor([x], device=get_current_device(), dtype=torch.float)
     dist.all_reduce(tensor)
     tensor = tensor / world_size
     return tensor.item()
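
Note: with get_current_device() the helper places its scratch tensor on whatever accelerator the job runs on, not just CUDA. A hedged usage sketch (the scalar is an assumed local measurement, not from the source):

import torch.distributed as dist

step_time = 0.42  # per-rank measurement, e.g. seconds per step
avg_step_time = all_reduce_mean(step_time, dist.get_world_size())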