[pipeline,shardformer] Fix p2p efficiency in pipeline, allow skipping loading weights not in weight_map when strict=False, fix llama flash attention forward, add flop estimation by megatron in llama benchmark (#5017)

* Use p2p

* Cannot do bidirectional send over p2p

* Refactor tensor creation and serialization in P2P
communication

* Fix llama forward args in flash attention

* Add flop estimate from megatron (a hedged sketch of the formula follows the commit message)

* Support loading weights not in weight_map when strict=False in hybrid_parallel (see the sketch after the commit metadata)

* Use send_forward_recv_backward, etc. in 1f1b

* Use dataclass for metadata (see the sketch just before the diff)
Remove torch.cuda.synchronize() as suggested

* Add comment about the torch.cuda.synchronize for potential error

* Typo

* Update hybrid_parallel_checkpoint_io.py

* Update p2p.py

* Update one_f_one_b.py

* Update p2p.py

---------

Co-authored-by: flybird11111 <1829166702@qq.com>
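
For the "flop estimate from megatron" item above, the estimate referred to is the per-iteration FLOP formula from the Megatron-LM paper (Narayanan et al., 2021). The sketch below is a hedged illustration of that formula; the function name and arguments are hypothetical and not the benchmark's actual API, and LLaMA's gated MLP makes the constant factors approximate.

def megatron_train_flops_per_iter(
    batch_size: int,
    seq_len: int,
    num_layers: int,
    hidden_size: int,
    vocab_size: int,
    activation_checkpointing: bool = False,
) -> float:
    """Approximate training FLOPs per iteration for a GPT-style decoder
    (Megatron-LM formula). Hypothetical helper for illustration only."""
    # forward + backward costs roughly 3x a forward pass; full activation
    # recomputation re-runs the forward, giving roughly 4x.
    coeff = 4 if activation_checkpointing else 3
    transformer_fwd = 24 * batch_size * seq_len * num_layers * hidden_size**2 * (
        1 + seq_len / (6 * hidden_size)
    )
    # the vocab/logits projection is not recomputed, so it always contributes 3x its forward cost
    logits_fwd_bwd = 6 * batch_size * seq_len * hidden_size * vocab_size
    return coeff * transformer_fwd + logits_fwd_bwd

Dividing this estimate by iteration time, the number of GPUs, and 1e12 gives the TFLOPS-per-GPU figure such a benchmark typically reports.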
Author: Elsa Granger
Date: 2023-11-16 20:15:59 +08:00
Committed by: GitHub
Parent: 28052a71fb
Commit: b2ad0d9e8f
6 changed files with 415 additions and 14 deletions
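
The "skip weights not in weight_map when strict=False" behavior can be pictured with the small sketch below; the helper name and surrounding logic are assumptions for illustration, not the actual hybrid_parallel_checkpoint_io.py code.

import json
from pathlib import Path
from typing import Set

def shard_files_for_params(index_path: Path, param_names: Set[str], strict: bool) -> Set[Path]:
    # Illustrative helper (assumed, not ColossalAI's API): map the parameters a rank
    # actually holds to the checkpoint shard files listed in the index's weight_map.
    index = json.loads(index_path.read_text())
    weight_map = index["weight_map"]  # param name -> shard file name

    needed = set()
    for name in param_names:
        if name not in weight_map:
            if strict:
                raise KeyError(f"{name} is missing from the checkpoint's weight_map")
            # strict=False: tolerate params the checkpoint does not provide and skip them
            continue
        needed.add(index_path.parent / weight_map[name])
    return needed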

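The "dataclass for metadata" item refers to describing tensors before sending them over P2P; the sketch below illustrates the general idea with assumed names and fields and is not the actual p2p.py implementation.

from dataclasses import dataclass
from typing import List

import torch

@dataclass
class TensorMetadata:
    # Lightweight description sent ahead of the raw tensor data (illustrative fields)
    shape: torch.Size
    dtype: torch.dtype
    requires_grad: bool

def describe(tensors: List[torch.Tensor]) -> List[TensorMetadata]:
    # The receiving rank can allocate matching buffers with
    # torch.empty(meta.shape, dtype=meta.dtype, device="cuda") and post recvs,
    # instead of pickling and sending whole tensors as opaque objects.
    return [TensorMetadata(t.shape, t.dtype, t.requires_grad) for t in tensors]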

@@ -127,6 +127,17 @@ class OneForwardOneBackwardSchedule(PipelineSchedule):
        if not self.stage_manager.is_last_stage():
            self.comm.send_forward(output_object, next_rank)

    def send_forward_recv_backward(self, output_object: Any, next_rank: int = None) -> Any:
        """Sends the output tensor to the next stage and copies the gradient tensor from the next stage in the pipeline.
        For 1F1B.

        Args:
            output_object (Any): Object to be sent.
            next_rank (int, optional): The rank of the recipient of the tensor.
        """
        if not self.stage_manager.is_last_stage():
            return self.comm.send_forward_recv_backward(output_object, next_rank)

    def send_backward(self, input_object: Any, prev_rank: int = None) -> None:
        """Sends the gradient tensor to the previous stage in the pipeline.
        For 1F1B.
@@ -138,6 +149,33 @@ class OneForwardOneBackwardSchedule(PipelineSchedule):
        if not self.stage_manager.is_first_stage():
            self.comm.send_backward(input_object, prev_rank)

    def send_backward_recv_forward(self, output_object: Any, prev_rank: int = None) -> Any:
        """Sends the gradient tensor to the previous stage and copies the input tensor from the previous stage in the pipeline.
        For 1F1B.

        Args:
            output_object (Any): Object to be sent.
            prev_rank (int, optional): The rank of the recipient of the tensor.
        """
        if not self.stage_manager.is_first_stage():
            return self.comm.send_backward_recv_forward(output_object, prev_rank)

    def send_forward_recv_forward(self, input_object: Any, prev_rank: int = None, next_rank: int = None) -> Any:
        """Sends the input tensor to the next stage and copies the input tensor from the previous stage in the pipeline.
        For 1F1B.

        Args:
            input_object (Any): Object to be sent.
            prev_rank (int, optional): The rank to receive the input tensor from.
            next_rank (int, optional): The rank to send the input tensor to.
        """
        if self.stage_manager.is_first_stage():
            return self.comm.send_forward(input_object, next_rank)
        elif self.stage_manager.is_last_stage():
            return self.comm.recv_forward(prev_rank)
        else:
            return self.comm.send_forward_recv_forward(input_object, prev_rank, next_rank)

    def forward_step(
        self,
        model: Module,
@@ -291,7 +329,6 @@ class OneForwardOneBackwardSchedule(PipelineSchedule):
                if not last_iteration:
                    input_obj = self.recv_forward()
            else:
                # TODO adjust here
                self.send_forward(output_obj)
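
To make the fused helpers above concrete, here is a hedged, self-contained sketch of how a combined "send forward, receive backward" exchange can be posted as one batched P2P operation with torch.distributed; it illustrates the idea behind the schedule methods, not ColossalAI's actual p2p.py.

import torch
import torch.distributed as dist

def fused_send_forward_recv_backward(output: torch.Tensor, next_rank: int) -> torch.Tensor:
    # Assumes an initialized process group and that both ranks agree on shapes/dtypes.
    grad_buffer = torch.empty_like(output)  # buffer for the gradient coming back
    ops = [
        dist.P2POp(dist.isend, output, next_rank),       # ship activations downstream
        dist.P2POp(dist.irecv, grad_buffer, next_rank),  # receive the matching gradient
    ]
    # Posting both directions in one batch avoids the ordering problems of
    # pairing a blocking send with a blocking recv on each side.
    for work in dist.batch_isend_irecv(ops):
        work.wait()
    return grad_buffer

In the steady-state 1F1B loop, send_forward_recv_backward stands in for a separate send_forward followed by recv_backward, and send_backward_recv_forward plays the same role in the opposite direction.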