[pipeline]: fix p2p comm, add metadata cache and support llama interleaved pp (#5134)

* test: add more p2p tests

* fix: remove send_forward_recv_forward, as the p2p op list needs to use the same group

* fix: make send and receive atomic

* feat: update P2PComm fn

* feat: add metadata cache in 1f1b

* feat: add metadata cache in interleaved pp

* feat: modify is_xx_stage fn

* revert: add _broadcast_object_list

* feat: add interleaved pp in llama policy

* feat: set NCCL_BUFFSIZE in HybridParallelPlugin
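
Taken together, these commits let the Llama example run with an interleaved pipeline schedule. Below is a minimal sketch of the resulting user-facing setup, based on the plugin arguments added in the diff further down; the sizes are illustrative and a launched distributed environment is assumed.

```python
import torch
from colossalai.booster import Booster
from colossalai.booster.plugin import HybridParallelPlugin

# Sketch only: assumes colossalai.launch_from_torch(...) (or equivalent) has
# already initialized the distributed environment, and that the model,
# optimizer and dataloader are built elsewhere. Sizes are illustrative.
plugin = HybridParallelPlugin(
    tp_size=1,
    pp_size=4,
    pp_style="interleaved",   # use the interleaved schedule instead of plain 1F1B
    num_model_chunks=2,       # each pipeline rank holds two model chunks
    zero_stage=1,
    enable_fused_normalization=torch.cuda.is_available(),
    num_microbatches=8,
    precision="bf16",
)
booster = Booster(plugin=plugin)
# model, optimizer, ... = booster.boost(model, optimizer, ...)

# With interleaving a rank owns several chunks, so loop-level "am I the last
# stage?" checks pass ignore_chunk=True to test only the pipeline rank itself.
is_pp_last_device = plugin.pp_size > 1 and plugin.stage_manager.is_last_stage(ignore_chunk=True)
```

Per the last commit, HybridParallelPlugin now also sets NCCL_BUFFSIZE internally, so no manual environment variable should be needed.
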
Author: Wenhao Chen
Date: 2023-12-22 10:44:00 +08:00
Committed by: GitHub
Parent: af952673f7
Commit: 4fa689fca1
15 changed files with 728 additions and 446 deletions


```diff
@@ -57,9 +57,7 @@ def evaluate_model(
     def evaluate_subset(dataloader: DataLoader):
         use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1
-        is_pp_last_device = use_pipeline and booster.plugin.stage_manager.is_last_stage(
-            None if not booster.plugin.stage_manager.is_interleave else -1
-        )
+        is_pp_last_device = use_pipeline and booster.plugin.stage_manager.is_last_stage(ignore_chunk=True)
         accum_loss = torch.zeros(1, device=get_current_device())
         for batch in dataloader:
@@ -136,9 +134,7 @@ def train_epoch(
     coordinator: DistCoordinator,
 ):
     use_pipeline = isinstance(booster.plugin, HybridParallelPlugin) and booster.plugin.pp_size > 1
-    is_pp_last_device = use_pipeline and booster.plugin.stage_manager.is_last_stage(
-        None if not booster.plugin.stage_manager.is_interleave else -1
-    )
+    is_pp_last_device = use_pipeline and booster.plugin.stage_manager.is_last_stage(ignore_chunk=True)
     print_flag = (not use_pipeline and coordinator.is_master()) or (use_pipeline and is_pp_last_device)
     total_step = len(train_dataloader)
```
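
Both hunks above make the same change: instead of passing a model-chunk index (`None` for the plain schedule, `-1` for the last chunk when interleaved), callers now ask the stage manager `is_last_stage(ignore_chunk=True)`, i.e. "is this the last pipeline rank, whichever chunk is active". A hypothetical sketch of that semantics (illustrative only, not the actual `PipelineStageManager` code):

```python
from typing import Optional


class StageManagerSketch:
    """Illustrative stand-in for a pipeline stage manager; names are made up."""

    def __init__(self, stage: int, num_stages: int, num_model_chunks: int = 1):
        self.stage = stage                        # this rank's pipeline position
        self.num_stages = num_stages              # number of pipeline ranks
        self.num_model_chunks = num_model_chunks  # > 1 when interleaving

    def is_last_stage(self, model_chunk_id: Optional[int] = None, ignore_chunk: bool = False) -> bool:
        is_last_rank = self.stage == self.num_stages - 1
        if ignore_chunk or self.num_model_chunks == 1:
            # "Last device" question: is this the final pipeline rank at all?
            return is_last_rank
        # Chunk-aware question: final rank *and* currently on the final model chunk.
        return is_last_rank and model_chunk_id == self.num_model_chunks - 1
```

The example scripts only need the rank-level answer (to decide which process accumulates the loss and prints progress), which is why `ignore_chunk=True` replaces the chunk index here.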


```diff
@@ -133,7 +133,9 @@ def main():
     plugin = HybridParallelPlugin(
         tp_size=args.tp,
         pp_size=args.pp,
+        pp_style="interleaved",
         zero_stage=args.zero,
+        num_model_chunks=2,
         enable_fused_normalization=torch.cuda.is_available(),
         num_microbatches=args.mbs,
         precision="bf16",
```
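
Most of the changed files are in the p2p layer and the 1F1B/interleaved schedules, where the commit caches tensor metadata so it is exchanged only for the first microbatch. A rough, self-contained sketch of that idea (illustrative names, not the actual `P2PComm` API; assumes `torch.distributed` is initialized with a gloo backend for the CPU tensors used here):

```python
import pickle

import torch
import torch.distributed as dist

# Illustrative sketch of a p2p metadata cache, not the actual colossalai
# P2PComm implementation. Shapes/dtypes are assumed constant across
# microbatches, so they are exchanged once and cached; after that, each step
# is a single tensor send/recv. Buffers are CPU tensors for simplicity (gloo);
# an NCCL setup would allocate them on the local CUDA device instead.

_sent_meta = {}  # peer rank -> True once the peer knows our shape/dtype
_recv_meta = {}  # peer rank -> (shape, dtype) learned from the peer


def _send_obj(obj, peer: int) -> None:
    payload = torch.frombuffer(bytearray(pickle.dumps(obj)), dtype=torch.uint8)
    dist.send(torch.tensor([payload.numel()], dtype=torch.long), dst=peer)
    dist.send(payload, dst=peer)


def _recv_obj(peer: int):
    size = torch.zeros(1, dtype=torch.long)
    dist.recv(size, src=peer)
    payload = torch.empty(int(size.item()), dtype=torch.uint8)
    dist.recv(payload, src=peer)
    return pickle.loads(payload.numpy().tobytes())


def send_activation(tensor: torch.Tensor, peer: int) -> None:
    if peer not in _sent_meta:
        _send_obj((tuple(tensor.shape), tensor.dtype), peer)  # first microbatch only
        _sent_meta[peer] = True
    dist.send(tensor, dst=peer)  # steady state: payload only


def recv_activation(peer: int) -> torch.Tensor:
    if peer not in _recv_meta:
        _recv_meta[peer] = _recv_obj(peer)  # learn shape/dtype once
    shape, dtype = _recv_meta[peer]
    buf = torch.empty(shape, dtype=dtype)
    dist.recv(buf, src=peer)
    return buf
```

The real code is more careful about ordering — the earlier commits note that a batched p2p op list must use a single process group and that send/receive are made atomic — but the caching idea is the same.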