[tensor] remove gpc in tensor tests (#1186)

Author: Jiarui Fang
Date: 2022-06-29 14:08:40 +08:00
Committed by: GitHub
Parent: 372f791444
Commit: c463f8adf9
4 changed files with 26 additions and 20 deletions


@@ -1,3 +1,4 @@
+from .process_group import ProcessGroup
 from .tensor_spec import TensorSpec
 from .compute_spec import ComputeSpec, ComputePattern
 from .colo_tensor import ColoTensor
@@ -6,7 +7,6 @@ from .utils import convert_parameter, named_params_with_colotensor
 from .dist_spec_mgr import DistSpecManager
 from .param_op_hook import ParamOpHook, ParamOpHookManager
 from . import distspec
-from .process_group import ProcessGroup
 __all__ = [
     'ColoTensor', 'convert_parameter', 'ComputePattern', 'TensorSpec', 'ComputeSpec', 'named_params_with_colotensor',


@@ -30,7 +30,7 @@ class ColoTensor(torch.Tensor):
     1. directly init.
     >>> colo_t1 = ColoTensor(torch.randn(2,3), spec = TensorSpec(distspec.replicate()))
     >>> # If initialized in a shard model, the tensor passed in is one shard of the global tensor.
-    >>> shard_spec = distspec.shard(process_group=gpc.get_group(ParallelMode.DATA),
+    >>> shard_spec = distspec.shard(process_group=ProcessGroup(tp_degree=world_size),
     >>>                             dims=[0],
     >>>                             num_partitions=[world_size])
     >>> tensor_spec = TensorSpec(shard_spec)
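
The updated docstring drops the global parallel context (gpc) in favor of an explicitly constructed ProcessGroup. As a minimal, hedged sketch of that usage, the snippet below assumes torch.distributed has already been initialized (one process per GPU, e.g. launched via torchrun), that the names are importable from the colossalai.tensor package exported in the __init__.py hunk above, and that the tensor shapes and tp_degree value are purely illustrative.

# Illustrative sketch only; mirrors the updated docstring rather than defining the API.
import torch
import torch.distributed as dist
from colossalai.tensor import ColoTensor, TensorSpec, ProcessGroup, distspec

dist.init_process_group(backend='nccl')   # assumption: launched via torchrun, one process per GPU
world_size = dist.get_world_size()

# 1. Directly init a replicated ColoTensor.
colo_t1 = ColoTensor(torch.randn(2, 3), spec=TensorSpec(distspec.replicate()))

# 2. Shard along dim 0; the process group now comes from ProcessGroup instead of gpc.
shard_spec = distspec.shard(process_group=ProcessGroup(tp_degree=world_size),
                            dims=[0],
                            num_partitions=[world_size])
colo_t2 = ColoTensor(torch.randn(4 * world_size, 3), spec=TensorSpec(shard_spec))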


@@ -5,7 +5,7 @@ from typing import List, Optional
 class ProcessGroup:
     """
     Process Group contains group partition for Tensor Parallel and Data Parallel.
-    WARNING, the ProcessGroup must be used after torch.distributed.initialize()
+    NOTE, the ProcessGroup must be used after torch.distributed.initialize()
     args:
         rank: the global rank of the current process.
         ranks: List[int], a list of rank ids belonging to this process group.
@@ -15,16 +15,24 @@ class ProcessGroup:
     """
     def __init__(self,
-                 rank: int,
-                 ranks: List[int],
+                 rank: Optional[int] = None,
+                 ranks: Optional[List[int]] = None,
                  backend: str = 'nccl',
                  tp_degree: Optional[int] = None,
                  dp_degree: Optional[int] = None) -> None:
-        self._rank = rank
-        self._rank_list = ranks
+        assert torch.distributed.is_initialized(), f"ProcessGroup must be used after distributed initialized"
+        if rank is None:
+            self._rank = torch.distributed.get_rank()
+        else:
+            self._rank = rank
+        if ranks is None:
+            self._rank_list = list(range(torch.distributed.get_world_size()))
+        else:
+            self._rank_list = ranks
         self._backend = backend
         self._world_size = len(self._rank_list)
-        assert torch.distributed.is_initialized(), f"ProcessGroup must be used after distributed initialized"
         if dp_degree is None and tp_degree is None:
             self._dp_degree = self._world_size
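
Taken together, the new defaults let callers build a ProcessGroup without spelling out rank or ranks, as long as torch.distributed is already initialized (otherwise the assert above fires). The sketch below is illustrative only, assuming the colossalai.tensor import path from the first hunk; the explicit form is shown purely to contrast with the new defaults, and the degree values are assumptions.

# Illustrative sketch of the new defaults; not part of the commit itself.
import torch.distributed as dist
from colossalai.tensor import ProcessGroup

dist.init_process_group(backend='nccl')   # must run before constructing a ProcessGroup

# Before this commit: rank and ranks had to be passed explicitly.
pg_explicit = ProcessGroup(rank=dist.get_rank(),
                           ranks=list(range(dist.get_world_size())),
                           tp_degree=dist.get_world_size())

# With this commit: rank defaults to dist.get_rank() and ranks to every rank in the job.
pg_default = ProcessGroup(tp_degree=dist.get_world_size())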