[legacy] clean up legacy code (#4743)

* [legacy] remove outdated codes of pipeline (#4692)

* [legacy] remove cli of benchmark and update optim (#4690)

* [legacy] remove cli of benchmark and update optim

* [doc] fix cli doc test

* [legacy] fix engine clip grad norm

* [legacy] remove outdated colo tensor (#4694)

* [legacy] remove outdated colo tensor

* [test] fix test import

* [legacy] move outdated zero to legacy (#4696)

* [legacy] clean up utils (#4700)

* [legacy] clean up utils

* [example] update examples

* [legacy] clean up amp

* [legacy] fix amp module

* [legacy] clean up gpc (#4742)

* [legacy] clean up context

* [legacy] clean core, constants and global vars

* [legacy] refactor initialize

* [example] fix examples ci

* [example] fix examples ci

* [legacy] fix tests

* [example] fix gpt example

* [example] fix examples ci

* [devops] fix ci installation

* [example] fix examples ci
Author: Hongxin Liu
Date: 2023-09-18 16:31:06 +08:00
Committed by: GitHub
Parent: 32e7f99416
Commit: b5f9e37c70
342 changed files with 2919 additions and 4182 deletions


@@ -0,0 +1,78 @@
from enum import Enum
from typing import List

__all__ = ['ReplicaSpec', 'ShardSpec']


class DistPlacementPattern(Enum):
    REPLICATE = 'r'
    SHARD = 's'


class _DistSpec:
    """_DistSpec

    A class that describes a distributed specification.

    The dist spec only applies to tensor parallel process groups, because the dist spec of a
    data parallel process group can be deduced automatically.

    This is an internal data structure. The user-facing APIs are `ShardSpec` and `ReplicaSpec`.

    Args:
        dist_placement_pattern (DistPlacementPattern): the pattern describing how tensors are distributed among processes.
            The dist_placement_pattern is picked from a limited set, currently including two patterns: replicate and shard.
        process_group (Optional[ProcessGroup], optional): the process group containing the processes. Defaults to None.
    """

    def __init__(self, dist_placement_pattern: DistPlacementPattern, **meta_info):
        self.placement = dist_placement_pattern
        for k, v in meta_info.items():
            setattr(self, k, v)

    def __eq__(self, other: "_DistSpec") -> bool:
        if dir(self) != dir(other):
            return False
        for attr in dir(self):
            if not attr.startswith('__') and getattr(self, attr) != getattr(other, attr):
                return False
        return True

    def __repr__(self) -> str:
        attr_list = []
        for attr in dir(self):
            if not attr.startswith('__'):
                attr_list.append(f'{attr}={str(getattr(self, attr))}')
        attr_str = ", ".join(attr_list)
        return "DistSpec(" + attr_str + ")"


def ReplicaSpec() -> _DistSpec:
    """ReplicaSpec

    A distributed specification indicating that the tensor is replicated across the tensor parallel process group.

    Returns:
        _DistSpec: a replicated dist spec instance.
    """
    return _DistSpec(DistPlacementPattern.REPLICATE)


def ShardSpec(dims: List[int], num_partitions: List[int]) -> _DistSpec:
    """ShardSpec

    A distributed specification indicating that the tensor is sharded across the tensor parallel process group.

    Note:
        Currently, sharding on only one dimension is valid. In other words, dims should be of size 1.

    Args:
        dims (List[int]): a list of dimensions to shard on.
        num_partitions (List[int]): a list of the number of partitions for each dimension.

    Returns:
        _DistSpec: a sharded dist spec instance.
    """
    assert isinstance(dims, list) and isinstance(num_partitions, list)
    assert len(dims) == len(num_partitions)
    return _DistSpec(DistPlacementPattern.SHARD, dims=tuple(dims), num_partitions=tuple(num_partitions))
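A minimal usage sketch (not part of the diff above): assuming the file is importable as a module named `distspec` (the import path here is hypothetical; the real package path is whatever this commit places the file under), the two factory functions can be exercised roughly as follows.

# Sketch only: assumes the new file above is available locally as distspec.py.
from distspec import ReplicaSpec, ShardSpec

# Replicate the tensor across the tensor parallel process group.
replica = ReplicaSpec()
print(replica)  # e.g. DistSpec(placement=DistPlacementPattern.REPLICATE)

# Shard the tensor along dimension 0 into 4 partitions.
shard = ShardSpec(dims=[0], num_partitions=[4])
print(shard)    # e.g. DistSpec(dims=(0,), num_partitions=(4,), placement=DistPlacementPattern.SHARD)

# Dist specs compare equal when their placement and meta info match.
assert shard == ShardSpec([0], [4])
assert shard != replica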