[shardformer] fix chatglm implementation (#5644)

* [shardformer] fix chatglm policy

* [shardformer] fix chatglm flash attn

* [shardformer] update readme

* [shardformer] fix chatglm init

* [shardformer] fix chatglm test

* [pipeline] fix chatglm merge batch
Author: Hongxin Liu
Date: 2024-04-25 14:41:17 +08:00
Committer: GitHub
Parent: 5d88ef1aaf
Commit: bbb2c21f16
11 changed files with 193 additions and 117 deletions


@@ -1,7 +1,6 @@
 import torch
-from colossalai.shardformer.modeling.chatglm2_6b.configuration_chatglm import ChatGLMConfig
-from colossalai.shardformer.modeling.chatglm2_6b.modeling_chatglm import ChatGLMForConditionalGeneration, ChatGLMModel
+from torch.nn import init
+from transformers import AutoConfig, AutoModelForCausalLM
 from ..registry import ModelAttribute, model_zoo
@@ -34,19 +33,26 @@ loss_fn_for_chatglm_model = lambda x: torch.nn.functional.mse_loss(
 )
 loss_fn = lambda x: x["loss"]
-config = ChatGLMConfig(
+config = AutoConfig.from_pretrained(
+    "THUDM/chatglm2-6b",
+    trust_remote_code=True,
     num_layers=2,
     padded_vocab_size=65024,
     hidden_size=64,
     ffn_hidden_size=214,
     num_attention_heads=8,
     kv_channels=16,
     rmsnorm=True,
     original_rope=True,
     use_cache=True,
     multi_query_attention=False,
     torch_dtype=torch.float32,
 )
-infer_config = ChatGLMConfig(
+infer_config = AutoConfig.from_pretrained(
+    "THUDM/chatglm2-6b",
+    trust_remote_code=True,
     num_layers=2,
     padded_vocab_size=65024,
     hidden_size=128,
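Note on the change above: instead of building a vendored ChatGLMConfig, the test now pulls the real ChatGLM2-6B config with trust_remote_code=True and shrinks it via keyword overrides. A minimal sketch of that pattern (assuming the THUDM/chatglm2-6b config is downloadable or cached); any kwarg that matches a config attribute overrides the loaded value:

from transformers import AutoConfig

# Sketch only: matching kwargs override fields on the returned config,
# which is how the test keeps the model tiny.
tiny_config = AutoConfig.from_pretrained(
    "THUDM/chatglm2-6b",
    trust_remote_code=True,
    num_layers=2,
    hidden_size=64,
)
assert tiny_config.num_layers == 2 and tiny_config.hidden_size == 64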
@@ -60,18 +66,18 @@ infer_config = ChatGLMConfig(
     torch_dtype=torch.float32,
 )
-model_zoo.register(
-    name="transformers_chatglm",
-    model_fn=lambda: ChatGLMModel(config, empty_init=False),
-    data_gen_fn=data_gen,
-    output_transform_fn=output_transform_fn,
-    loss_fn=loss_fn_for_chatglm_model,
-    model_attribute=ModelAttribute(has_control_flow=True),
-)
+def init_chatglm():
+    model = AutoModelForCausalLM.from_config(config, empty_init=False, trust_remote_code=True)
+    for m in model.modules():
+        if m.__class__.__name__ == "RMSNorm":
+            init.ones_(m.weight)
+    return model
 model_zoo.register(
     name="transformers_chatglm_for_conditional_generation",
-    model_fn=lambda: ChatGLMForConditionalGeneration(config, empty_init=False),
+    model_fn=init_chatglm,
     data_gen_fn=data_gen_for_conditional_generation,
     output_transform_fn=output_transform_fn,
     loss_fn=loss_fn,
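A note on init_chatglm above: the ChatGLM classes now come from trust_remote_code, so they cannot be imported for an isinstance() check and modules are matched by class name instead; resetting each RMSNorm weight to ones presumably gives the test model deterministic, well-defined norm weights (the remote RMSNorm appears to allocate its weight without initializing it). A small standalone sketch of the same pattern, with a hypothetical helper name:

import torch
from torch.nn import init

def reset_rmsnorm_weights(model: torch.nn.Module) -> torch.nn.Module:
    # Match by class name because the RMSNorm class lives in remote code
    # and cannot be imported for an isinstance() check.
    for m in model.modules():
        if m.__class__.__name__ == "RMSNorm":
            init.ones_(m.weight)
    return model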


@@ -227,7 +227,7 @@ def check_output_hidden_state(
 def check_loss(org_loss: Tensor, sharded_loss: Tensor, atol: float = 1e-5, rtol: float = 1e-3):
-    assert torch.allclose(org_loss.float(), sharded_loss.float(), atol=atol, rtol=rtol)
+    assert_close(org_loss.float(), sharded_loss.float(), atol=atol, rtol=rtol)
 def check_weight(
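The swap from a bare assert torch.allclose(...) to torch.testing.assert_close keeps the same atol/rtol semantics but reports which values diverged and by how much when the check fails. A quick illustration (values invented for the example):

import torch
from torch.testing import assert_close

org_loss = torch.tensor(1.00)
sharded_loss = torch.tensor(1.01)

# `assert torch.allclose(...)` would only raise a bare AssertionError;
# assert_close raises one that includes the absolute and relative difference.
try:
    assert_close(org_loss, sharded_loss, atol=1e-5, rtol=1e-3)
except AssertionError as err:
    print(err)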


@@ -11,6 +11,7 @@ from tests.test_shardformer.test_model._utils import (
     build_model_from_hybrid_plugin,
     check_all_grad_tensors,
     check_loss,
+    check_output_hidden_state,
     check_weight,
     get_grad_tensors_for_check,
     run_forward_backward_with_hybrid_plugin,
@@ -103,8 +104,8 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
             atol, rtol = 5e-3, 5e-3
         # TODO: ChatGLMModel output is [S, B, H], merging batch of pipeline is wrong
-        # if org_model.__class__.__name__ == "ChatGLMModel":
-        #     check_output_hidden_state(org_output, sharded_output, stage_manager, atol=atol, rtol=rtol, dim=1)
+        if org_model.__class__.__name__ == "ChatGLMModel":
+            check_output_hidden_state(org_output, sharded_output, stage_manager, atol=atol, rtol=rtol, dim=1)
         check_loss(org_loss, sharded_loss, atol=atol, rtol=rtol)
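Context for the dim=1 argument: ChatGLMModel returns hidden states shaped [seq_len, batch, hidden], so pipeline micro-batch outputs have to be merged along the batch axis (dim 1) rather than dim 0 as for batch-first models; with that merge fixed, the hidden-state check can be re-enabled. A toy illustration with invented shapes:

import torch

# Two micro-batch outputs of shape [seq_len=8, micro_batch=2, hidden=64].
micro_outputs = [torch.randn(8, 2, 64), torch.randn(8, 2, 64)]

wrong = torch.cat(micro_outputs, dim=0)  # [16, 2, 64]: doubles the sequence length
right = torch.cat(micro_outputs, dim=1)  # [8, 4, 64]: restores the full batch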
@@ -177,14 +178,14 @@ def check_forward_backward(model_fn, data_gen_fn, output_transform_fn, loss_fn,
         {
             "tp_size": 4,
             "pp_size": 1,
-            "enable_all_optimization": True,
+            "enable_all_optimization": False,
             "use_lazy_init": False,
             "precision": "fp32",
         },
         {
             "tp_size": 2,
             "pp_size": 1,
-            "enable_all_optimization": True,
+            "enable_all_optimization": False,
             "use_lazy_init": False,
             "precision": "fp32",
         },
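On the last change: if I read the test harness correctly, each of these dicts is forwarded to HybridParallelPlugin, so the edit keeps the fused/flash-attention optimizations off for the fp32 configurations and the outputs stay directly comparable to the unsharded model. A rough sketch of what one such dict maps to (assumes torch.distributed has already been launched, e.g. via colossalai.launch):

from colossalai.booster.plugin import HybridParallelPlugin

# Sketch only: enable_all_optimization=False leaves the extra kernel
# optimizations disabled for the fp32 correctness comparison.
plugin = HybridParallelPlugin(tp_size=2, pp_size=1, enable_all_optimization=False, precision="fp32")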