Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-02 09:38:05 +00:00)
[zero] add strict ddp mode (#2508)
* [zero] add strict ddp mode
* [polish] add comments for strict ddp mode
* [zero] fix test error
@@ -53,6 +53,14 @@ def gpt2_24b(checkpoint=True):
     return GPTLMModel(hidden_size=8192, num_layers=30, num_attention_heads=16, checkpoint=checkpoint)
 
 
+def gpt2_30b(checkpoint=True):
+    return GPTLMModel(hidden_size=8192, num_layers=37, num_attention_heads=16, checkpoint=checkpoint)
+
+
+def gpt2_40b(checkpoint=True):
+    return GPTLMModel(hidden_size=8192, num_layers=50, num_attention_heads=16, checkpoint=checkpoint)
+
+
 def model_builder(model_size: str) -> callable:
     if model_size == "gpt2_medium":
         return gpt2_medium
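The size suffixes in these builder names can be sanity-checked with the standard transformer parameter estimate: each GPT-2 block contributes roughly 12 * hidden^2 parameters (4 h^2 for attention, 8 h^2 for the MLP). The following back-of-the-envelope check is added here for context and is not part of the commit:

# Illustrative approximation only, not code from this commit:
def approx_params_billion(hidden: int, layers: int) -> float:
    # ~12 * h^2 per block (4*h^2 attention + 8*h^2 MLP); embeddings ignored
    return 12 * layers * hidden ** 2 / 1e9

print(approx_params_billion(8192, 30))   # ~24.2 -> gpt2_24b
print(approx_params_billion(8192, 37))   # ~29.8 -> gpt2_30b
print(approx_params_billion(8192, 50))   # ~40.3 -> gpt2_40b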
@@ -66,6 +74,10 @@ def model_builder(model_size: str) -> callable:
         return gpt2_20b
     elif model_size == "gpt2_24b":
         return gpt2_24b
+    elif model_size == "gpt2_30b":
+        return gpt2_30b
+    elif model_size == "gpt2_40b":
+        return gpt2_40b
     else:
         raise TypeError(f"model_builder {model_size}")
 
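For reference, the builder is consumed by looking up a constructor by name and then calling it; a minimal usage sketch (the driver lines are assumed, not shown in this diff):

model_fn = model_builder("gpt2_40b")   # look up the constructor by name
model = model_fn(checkpoint=True)      # 50 layers, hidden size 8192, activation checkpointing on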
@@ -187,17 +187,18 @@ def tensor_parallelize(model: torch.nn.Module, pg: ProcessGroup):
 
 
 # Gemini + ZeRO DDP
-def build_gemini(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto"):
+def build_gemini(model: torch.nn.Module, pg: ProcessGroup, placement_policy: str = "auto", ddp_flag: bool = True):
     fp16_init_scale = 2**5
     gpu_margin_mem_ratio_for_auto = 0
 
     if version.parse(CAI_VERSION) > version.parse("0.1.10"):
         model = GeminiDDP(model,
+                          strict_ddp_mode=ddp_flag,
                           device=get_current_device(),
                           placement_policy=placement_policy,
                           pin_memory=True,
                           hidden_dim=model.config.n_embd,
-                          search_range_mb=64)
+                          search_range_mb=128)
     # configure the const policy
     if placement_policy == 'const':
         model.gemini_manager._placement_policy.set_const_memory_boundary(2 * 1024)
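The new `ddp_flag` is forwarded to GeminiDDP as `strict_ddp_mode`, which, per the commit title, makes Gemini treat the whole model as pure ZeRO data parallelism; the chunk `search_range_mb` is also widened from 64 to 128 in the same hunk. A minimal sketch of calling the wrapper directly, assuming the colossalai 0.2.x import layout this example targets (treat the import paths as assumptions for your version):

from colossalai.nn.parallel import GeminiDDP
from colossalai.utils import get_current_device

model = gpt2_medium(checkpoint=True)               # any builder from the zoo above
model = GeminiDDP(model,
                  strict_ddp_mode=True,            # the flag this commit introduces
                  device=get_current_device(),
                  placement_policy="auto",
                  pin_memory=True,
                  hidden_dim=model.config.n_embd,  # mirrors the script's own call
                  search_range_mb=128)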
@@ -279,11 +280,12 @@ def main():
         tp_pg = ProcessGroup(tp_degree=args.tp_degree)
         # Tensor Parallelism (TP)
         # You should notice that v0.1.10 is not compatible with TP degree > 1
-        tensor_parallelize(model, tp_pg)
+        if args.tp_degree > 1:
+            tensor_parallelize(model, tp_pg)
 
         # build a Gemini model and a highly optimized cpu optimizer
         # Gemini + ZeRO DP, Note it must be used after TP
-        model, optimizer = build_gemini(model, tp_pg, args.placement)
+        model, optimizer = build_gemini(model, tp_pg, args.placement, args.tp_degree == 1)
 
         logger.info(get_mem_info(prefix='After init optim, '), ranks=[0])
     else:
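Putting the two changes together: the call site enables strict DDP exactly when tensor parallelism is off (`args.tp_degree == 1`) and skips `tensor_parallelize` in that case. The wiring in isolation, as a hypothetical helper for illustration rather than code from the commit:

def use_strict_ddp(tp_degree: int) -> bool:
    # Strict DDP shards parameters purely data-parallel, which would clash
    # with tensor-parallel sharding, so enable it only when TP is disabled.
    return tp_degree == 1

assert use_strict_ddp(1) is True    # pure ZeRO-DDP run
assert use_strict_ddp(2) is False   # TP run: Gemini searches chunk groups instead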