[pipeline] test pure pipeline process using llama (#4218)

* bloom policy

* llama pipeline forward and tests

* fix the output and attention_mask

* fix name

* bind argument to policy

* Revert "bloom policy"

This reverts commit 8dee68a0a2.

This policy should be reverted and copied to feature/bloom

* revert the bloom changes

* cancel unneeded inputs

* gpt

* finish llama

* causal lm and sequence classification

* revision

* add pure pipeline test

* fixed version

* fixed version

* pure pipeline
Author: Jianghai
Date: 2023-07-25 14:31:21 +08:00
Committed by: Hongxin Liu
Parent: 36e546b2cc
Commit: d0807122e2
2 changed files with 30 additions and 18 deletions


@@ -9,6 +9,7 @@ import torch
 import torch.distributed as dist
 from torch.distributed import ProcessGroup
 from torch.distributed import distributed_c10d as c10d
+from version_parser.version import Version

 from .stage_manager import PipelineStageManager
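The new import enables semantic version comparison. A minimal sketch of why plain string comparison of torch.__version__ (used in the removed code below) is unsafe; note the diff imports Version from version_parser, while this illustration uses the common packaging library, assuming equivalent comparison semantics:

from packaging.version import Version

# String comparison is lexicographic: at the third character '9' > '1',
# so "1.9.0" is (wrongly) considered newer than "1.13.0".
assert "1.9.0" >= "1.13.0"

# Version compares release components numerically and also handles local
# version suffixes such as torch's "+cu117" builds.
assert not Version("1.9.0") >= Version("1.13.0")
assert Version("1.13.0+cu117") >= Version("1.13.0")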
@@ -61,17 +62,6 @@ def _broadcast_object_list(object_list: List[Any],
         c10d._warn_not_in_group("broadcast_object_list")
         return

-    my_rank = dist.get_rank()
-    # Serialize object_list elements to tensors on src rank.
-    if my_rank == src:
-        if torch.__version__ >= "1.13.0":
-            tensor_list, size_list = zip(*[c10d._object_to_tensor(obj, device=device) for obj in object_list])
-        else:
-            tensor_list, size_list = zip(*[c10d._object_to_tensor(obj) for obj in object_list])
-        object_sizes_tensor = torch.cat(size_list)
-    else:
-        object_sizes_tensor = torch.empty(len(object_list), dtype=torch.long)
-
     is_nccl_backend = c10d._check_for_nccl_backend(group)
     current_device = None
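(The block removed above ran before the target device was known, so on the src rank objects were serialized against the caller-supplied device argument, which may be None. The hunk below reinstates the same logic after current_device has been resolved.)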
@@ -83,6 +73,18 @@ def _broadcast_object_list(object_list: List[Any],
         current_device = torch.device("cpu")
         if is_nccl_backend:
             current_device = torch.device("cuda", torch.cuda.current_device())
+
+    my_rank = dist.get_rank()
+    # Serialize object_list elements to tensors on src rank.
+    if my_rank == src:
+        if Version(torch.__version__) >= Version("1.13.0"):
+            tensor_list, size_list = zip(*[c10d._object_to_tensor(obj, device=current_device) for obj in object_list])
+        else:
+            tensor_list, size_list = zip(*[c10d._object_to_tensor(obj) for obj in object_list])
+        object_sizes_tensor = torch.cat(size_list)
+    else:
+        object_sizes_tensor = torch.empty(len(object_list), dtype=torch.long)
+
     if is_nccl_backend:
         object_sizes_tensor = object_sizes_tensor.to(current_device)
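For context, _broadcast_object_list mirrors the public torch.distributed.broadcast_object_list, and the patched ordering matches it: resolve the device first, then serialize on the source rank. A hypothetical two-rank usage sketch of the public equivalent (the script name and payload are illustrative, not from this commit):

# Run with: torchrun --nproc_per_node=2 demo.py  (hypothetical script name)
import torch
import torch.distributed as dist

dist.init_process_group(backend="nccl")
torch.cuda.set_device(dist.get_rank())

# Arbitrary picklable metadata, e.g. tensor shapes exchanged between pipeline stages.
objs = [{"hidden_states_shape": (4, 128, 4096)}] if dist.get_rank() == 0 else [None]

# With the patched ordering, the src rank serializes directly onto the resolved
# CUDA device, avoiding an extra host-to-device copy before the NCCL broadcast.
dist.broadcast_object_list(objs, src=0)
print(f"rank {dist.get_rank()}: {objs[0]}")
dist.destroy_process_group()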