[moe] test deepseek

Commit:    74eccac0db
Parent:    dc583aa576
Author:    hxwang
Date:      2024-07-16 10:10:40 +00:00
Committer: Hongxin Liu

10 changed files with 276 additions and 68 deletions

View File

@@ -0,0 +1,133 @@
import os
import shutil
from copy import deepcopy
from typing import Tuple

import pytest
import torch
import torch.distributed as dist
from transformers import AutoConfig, AutoModel

import colossalai
from colossalai.booster.booster import Booster
from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin
from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
from colossalai.testing.random import seed_all
from tests.test_moe.moe_utils import loose_close
from tests.test_moe.test_moe_checkpoint import check_model_equal

NUM_BATCH = 4
NUM_TOK_PER_BATCH, NUM_EXPERTS = 7, 4
HIDDEN_SIZE_PER_HEAD = 4
NUM_HEADS = 4
TOP_K = 1
@parameterize("config", [(1, 1, 1)])
def run_zero_with_original_model(config: Tuple[int, ...]):
stage, ep_size, tp_size = config
dtype = torch.float16
rank = torch.distributed.get_rank()
torch.cuda.set_device(dist.get_rank())
plugin = MoeHybridParallelPlugin(
pp_size=1,
tp_size=tp_size,
moe_tp_size=tp_size,
ep_size=ep_size,
zero_stage=stage,
overlap_communication=False,
initial_scale=1,
precision="fp32",
)
booster = Booster(plugin=plugin)
seed_all(10086)
config = AutoConfig.from_pretrained("deepseek-ai/deepseek-moe-16b-base", trust_remote_code=True)
config.hidden_size = HIDDEN_SIZE_PER_HEAD * NUM_HEADS
config.intermediate_size = HIDDEN_SIZE_PER_HEAD * NUM_HEADS * 2
config.num_hidden_layers = 2
config.num_attention_heads = NUM_HEADS
config.num_key_value_heads = NUM_HEADS
config.n_routed_experts = NUM_EXPERTS
config.num_experts_per_tok = TOP_K
torch_model = AutoModel.from_config(config, trust_remote_code=True).cuda().to(dtype)
torch_optimizer = torch.optim.SGD(torch_model.parameters(), lr=1)
zero_model = deepcopy(torch_model).to(dtype)
zero_optimizer = torch.optim.SGD(zero_model.parameters(), lr=1)
zero_model, zero_optimizer, _, _, _ = booster.boost(zero_model, zero_optimizer)
# create different input
seed_all(1453 + rank)
torch_model.train()
zero_model.train()
for _ in range(2):
input_data = torch.rand(
NUM_BATCH, NUM_TOK_PER_BATCH, HIDDEN_SIZE_PER_HEAD * NUM_HEADS, requires_grad=True
).cuda()
dist.all_reduce(input_data, group=plugin.tp_group) # tp requires duplicate input
zero_output = zero_model(inputs_embeds=input_data.to(dtype)).last_hidden_state.mean()
zero_optimizer.backward(zero_output)
zero_optimizer.step()
zero_optimizer.zero_grad()
dist.all_reduce(zero_output)
all_inputs = [torch.empty_like(input_data) for _ in range(dist.get_world_size())]
dist.all_gather(all_inputs, input_data)
torch_output_sum = 0
for input_data_ in all_inputs:
torch_output = torch_model(inputs_embeds=input_data_.to(dtype)).last_hidden_state.mean()
torch_output.backward()
torch_output_sum += torch_output.detach()
# avg dp grads
for p in torch_model.parameters():
if p.grad is not None:
p.grad /= dist.get_world_size()
torch_optimizer.step()
torch_optimizer.zero_grad()
loose_close(zero_output, torch_output_sum, dtype=dtype)
# use checkpoint to load sharded zero model
model_dir = "./test_deepseek"
if dist.get_rank() == 0:
os.makedirs(model_dir, exist_ok=True)
dist.barrier()
booster.save_model(zero_model, model_dir, shard=True)
dist.barrier()
saved_model = AutoModel.from_pretrained(model_dir, trust_remote_code=True).cuda()
check_model_equal(torch_model, saved_model)
dist.barrier()
if dist.get_rank() == 0:
shutil.rmtree(model_dir)
print(f"{dist.get_rank()} test passed")

def run_dist(rank, world_size, port):
    colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
    run_zero_with_original_model()


@pytest.mark.dist
@pytest.mark.parametrize("world_size", [4])
@rerun_if_address_is_in_use()
def test_deepseek(world_size):
    spawn(run_dist, world_size)


if __name__ == "__main__":
    test_deepseek(world_size=4)
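
The test above compares tensors with loose_close, imported from tests.test_moe.moe_utils but not shown in this diff. A minimal sketch of such a tolerance-based comparison helper, assuming it simply wraps torch.testing.assert_close with looser tolerances for half-precision dtypes (the tolerance values here are illustrative, not the repository's):

import torch
from torch.testing import assert_close


def loose_close(a: torch.Tensor, b: torch.Tensor, dtype: torch.dtype = torch.float32):
    # assumed tolerances, chosen for illustration only
    rtol, atol = None, None
    if dtype is torch.float16:
        rtol, atol = 5e-2, 5e-4
    elif dtype is torch.bfloat16:
        rtol, atol = 4e-3, 4e-3

    a = a.detach().to(dtype)
    b = b.detach().to(dtype).to(a.device)
    # raises AssertionError if the tensors differ beyond the tolerances
    assert_close(a, b, rtol=rtol, atol=atol)

With this reading, the loose_close(zero_output, torch_output_sum, dtype=dtype) call above fails loudly whenever the ZeRO path and the plain torch path diverge beyond fp16 tolerance.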

View File

@@ -24,16 +24,6 @@ NUM_HEADS = 4
 TOP_K = 1


-def split_grad(grad, world_size):
-    with torch.no_grad():
-        grad = grad.clone().detach().flatten()
-        padding_size = (world_size - grad.numel() % world_size) % world_size
-        if padding_size > 0:
-            grad = torch.nn.functional.pad(grad, [0, padding_size])
-        splited_grad = grad.split(grad.numel() // world_size)
-    return splited_grad
-
-
 @parameterize("config", [(1, 1, 4), (1, 2, 2), (1, 4, 1)])
 def run_zero_with_original_model(config: Tuple[int, ...]):
     stage, ep_size, tp_size = config
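
The removed split_grad helper flattens a gradient, pads it so its length divides evenly by world_size, and splits it into one equal shard per rank. A small self-contained illustration of that arithmetic (the definition is repeated here only so the snippet runs on its own):

import torch


def split_grad(grad, world_size):
    with torch.no_grad():
        grad = grad.clone().detach().flatten()
        padding_size = (world_size - grad.numel() % world_size) % world_size
        if padding_size > 0:
            grad = torch.nn.functional.pad(grad, [0, padding_size])
        splited_grad = grad.split(grad.numel() // world_size)
    return splited_grad


shards = split_grad(torch.arange(10, dtype=torch.float32), world_size=4)
print([s.numel() for s in shards])  # -> [3, 3, 3, 3]: 10 elements padded to 12, split 4 ways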

View File

@@ -16,6 +16,7 @@ from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParall
 from colossalai.tensor.moe_tensor.api import is_moe_tensor
 from colossalai.testing import parameterize, spawn
 from colossalai.testing.utils import spawn
+from tests.test_moe.moe_utils import loose_close

 tokens, n_experts = 7, 4
 hidden_size = 8
@@ -25,7 +26,7 @@ top_k = 2
 def check_model_equal(model1, model2):
     assert set(model1.state_dict().keys()) == set(model2.state_dict().keys())
     for i, ((name, p1), p2) in enumerate(zip(model1.named_parameters(), model2.parameters())):
-        if not torch.equal(p1.half(), p2.half()):
+        if loose_close(p1, p2, p1.dtype):
             print(f"Model parameter {name} is not equal. is_moe_tensor: {is_moe_tensor(p1)}")
             raise AssertionError(f"Model parameter {name} is not equal")

View File

@@ -21,16 +21,6 @@ NUM_HEADS = 4
 TOP_K = 2


-def split_grad(grad, world_size):
-    with torch.no_grad():
-        grad = grad.clone().detach().flatten()
-        padding_size = (world_size - grad.numel() % world_size) % world_size
-        if padding_size > 0:
-            grad = torch.nn.functional.pad(grad, [0, padding_size])
-        splited_grad = grad.split(grad.numel() // world_size)
-    return splited_grad
-
-
 @parameterize("stage", [1])
 @parameterize("ep_size", [1, 2, 4])
 def run_zero_with_original_model(stage: int, ep_size: int):

View File

@@ -14,21 +14,12 @@ from colossalai.testing import parameterize, rerun_if_address_is_in_use, spawn
 from colossalai.testing.random import seed_all
 from tests.test_moe.moe_utils import loose_close

-NUM_BATCH=4
+NUM_BATCH = 4
 NUM_TOK_PER_BATCH, NUM_EXPERTS = 7, 4
 HIDDEN_SIZE_PER_HEAD = 4
-NUM_HEADS=2
+NUM_HEADS = 2
 TOP_K = 1


-def split_grad(grad, world_size):
-    with torch.no_grad():
-        grad = grad.clone().detach().flatten()
-        padding_size = (world_size - grad.numel() % world_size) % world_size
-        if padding_size > 0:
-            grad = torch.nn.functional.pad(grad, [0, padding_size])
-        splited_grad = grad.split(grad.numel() // world_size)
-    return splited_grad
-
-
 @parameterize("stage", [1])
 @parameterize("ep_size", [1, 2, 4])
@@ -39,12 +30,7 @@ def run_zero_with_original_model(stage: int, ep_size: int):
     torch.cuda.set_device(dist.get_rank())
     plugin = MoeHybridParallelPlugin(
-        pp_size=1,
-        tp_size=1,
-        ep_size=ep_size,
-        zero_stage=stage,
-        overlap_communication=False,
-        initial_scale=1
+        pp_size=1, tp_size=1, ep_size=ep_size, zero_stage=stage, overlap_communication=False, initial_scale=1
     )
     booster = Booster(plugin=plugin)
@@ -81,7 +67,9 @@ def run_zero_with_original_model(stage: int, ep_size: int):
     zero_model.train()
     for _ in range(2):
         # zero-dp forward
-        input_data = torch.rand(NUM_BATCH, NUM_TOK_PER_BATCH, HIDDEN_SIZE_PER_HEAD * NUM_HEADS, requires_grad=True).cuda()
+        input_data = torch.rand(
+            NUM_BATCH, NUM_TOK_PER_BATCH, HIDDEN_SIZE_PER_HEAD * NUM_HEADS, requires_grad=True
+        ).cuda()
         zero_output = zero_model(inputs_embeds=input_data.to(dtype)).last_hidden_state.mean()
         # zero-dp backward
         zero_optimizer.backward(zero_output)