[pipeline] test pure pipeline process using llama (#4218)

* bloom policy

* llama pipeline forward and tests

* fix the output and attention_mask

* fix name

* bind argument to policy

* Revert "bloom policy"

This reverts commit 8dee68a0a2.

This policy should be reverted and copied to feature/bloom

* revert the bloom changes

* cancel unneeded inputs

* gpt

* finish llama

* causal lm and sequence classification

* revision

* add pure pipeline test

* fixed version

* fixed version

* pure pipeline
Author: Jianghai
Date: 2023-07-25 14:31:21 +08:00
Committed by: Hongxin Liu
Parent: 36e546b2cc
Commit: d0807122e2
2 changed files with 30 additions and 18 deletions


@@ -9,6 +9,7 @@ import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
 from torch.distributed import distributed_c10d as c10d
+from version_parser.version import Version

 from .stage_manager import PipelineStageManager
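The new import enables semantic version comparison. A minimal sketch of why plain string comparison of torch.__version__ (used in the removed code below) is unsafe; note the diff imports Version from version_parser, while this illustration uses the common packaging library, assuming equivalent comparison semantics:

from packaging.version import Version

# String comparison is lexicographic: at the third character '9' > '1',
# so "1.9.0" is (wrongly) considered newer than "1.13.0".
assert "1.9.0" >= "1.13.0"

# Version compares release components numerically and also handles local
# version suffixes such as torch's "+cu117" builds.
assert not Version("1.9.0") >= Version("1.13.0")
assert Version("1.13.0+cu117") >= Version("1.13.0")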
@@ -61,17 +62,6 @@ def _broadcast_object_list(object_list: List[Any],
         c10d._warn_not_in_group("broadcast_object_list")
         return

-    my_rank = dist.get_rank()
-    # Serialize object_list elements to tensors on src rank.
-    if my_rank == src:
-        if torch.__version__ >= "1.13.0":
-            tensor_list, size_list = zip(*[c10d._object_to_tensor(obj, device=device) for obj in object_list])
-        else:
-            tensor_list, size_list = zip(*[c10d._object_to_tensor(obj) for obj in object_list])
-        object_sizes_tensor = torch.cat(size_list)
-    else:
-        object_sizes_tensor = torch.empty(len(object_list), dtype=torch.long)
-
     is_nccl_backend = c10d._check_for_nccl_backend(group)
     current_device = None
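(The block removed above ran before the target device was known, so on the src rank objects were serialized against the caller-supplied device argument, which may be None. The hunk below reinstates the same logic after current_device has been resolved.)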
@@ -83,6 +73,18 @@ def _broadcast_object_list(object_list: List[Any],
         current_device = torch.device("cpu")
         if is_nccl_backend:
             current_device = torch.device("cuda", torch.cuda.current_device())
+
+    my_rank = dist.get_rank()
+    # Serialize object_list elements to tensors on src rank.
+    if my_rank == src:
+        if Version(torch.__version__) >= Version("1.13.0"):
+            tensor_list, size_list = zip(*[c10d._object_to_tensor(obj, device=current_device) for obj in object_list])
+        else:
+            tensor_list, size_list = zip(*[c10d._object_to_tensor(obj) for obj in object_list])
+        object_sizes_tensor = torch.cat(size_list)
+    else:
+        object_sizes_tensor = torch.empty(len(object_list), dtype=torch.long)
+
     if is_nccl_backend:
         object_sizes_tensor = object_sizes_tensor.to(current_device)
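For context, _broadcast_object_list mirrors the public torch.distributed.broadcast_object_list, and the patched ordering matches it: resolve the device first, then serialize on the source rank. A hypothetical two-rank usage sketch of the public equivalent (the script name and payload are illustrative, not from this commit):

# Run with: torchrun --nproc_per_node=2 demo.py  (hypothetical script name)
import torch
import torch.distributed as dist

dist.init_process_group(backend="nccl")
torch.cuda.set_device(dist.get_rank())

# Arbitrary picklable metadata, e.g. tensor shapes exchanged between pipeline stages.
objs = [{"hidden_states_shape": (4, 128, 4096)}] if dist.get_rank() == 0 else [None]

# With the patched ordering, the src rank serializes directly onto the resolved
# CUDA device, avoiding an extra host-to-device copy before the NCCL broadcast.
dist.broadcast_object_list(objs, src=0)
print(f"rank {dist.get_rank()}: {objs[0]}")
dist.destroy_process_group()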