Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-01 01:06:00 +00:00
[tensor] remove gpc in tensor tests (#1186)
@@ -1,3 +1,4 @@
+from .process_group import ProcessGroup
 from .tensor_spec import TensorSpec
 from .compute_spec import ComputeSpec, ComputePattern
 from .colo_tensor import ColoTensor
@@ -6,7 +7,6 @@ from .utils import convert_parameter, named_params_with_colotensor
 from .dist_spec_mgr import DistSpecManager
 from .param_op_hook import ParamOpHook, ParamOpHookManager
 from . import distspec
-from .process_group import ProcessGroup

 __all__ = [
     'ColoTensor', 'convert_parameter', 'ComputePattern', 'TensorSpec', 'ComputeSpec', 'named_params_with_colotensor',
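With ProcessGroup now imported at the top of the package __init__ (and the duplicate bottom-of-file import dropped), callers can pull it in alongside the other public names of colossalai.tensor. A minimal sketch of the resulting import surface; the exact membership of __all__ beyond the names visible above is an assumption:

# Sketch only: ProcessGroup is re-exported from the package root together with
# the names already listed in __all__ (membership beyond the visible line is assumed).
from colossalai.tensor import ColoTensor, TensorSpec, ComputeSpec, ProcessGroup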
@@ -30,7 +30,7 @@ class ColoTensor(torch.Tensor):
     1. directly init.
     >>> colo_t1 = ColoTensor(torch.randn(2,3), spec = TensorSpec(distspec.replicate())
     >>> # If initializaed in a shard model, the tensor passed in is one shard of the global tensor.
-    >>> shard_spec = distspec.shard(process_group=gpc.get_group(ParallelMode.DATA),
+    >>> shard_spec = distspec.shard(process_group=ProcessGroup(tp=world_size),
     >>>                             dims=[0],
     >>>                             num_partitions=[world_size])
     >>> tensor_spec = TensorSpec(shard_spec)
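The docstring change above is the core of this commit: the legacy gpc.get_group(ParallelMode.DATA) handle is replaced by the new ProcessGroup object. The sketch below expands that docstring into a standalone snippet; it assumes torch.distributed has already been initialized (for example via torchrun) and uses tp_degree= rather than the docstring's tp= shorthand, since tp_degree is the keyword shown in the ProcessGroup constructor later in this diff.

# Hedged sketch of building a row-sharded ColoTensor with the new ProcessGroup API.
# Assumes torch.distributed is already initialized and that distspec.shard()/TensorSpec
# keep the keyword signatures shown in the docstring above; not verified against the repo.
import torch
import torch.distributed as dist
from colossalai.tensor import ColoTensor, TensorSpec, ProcessGroup, distspec

world_size = dist.get_world_size()

# Tensor-parallel group spanning all ranks (the docstring writes tp=world_size;
# the constructor in this diff exposes tp_degree).
pg = ProcessGroup(tp_degree=world_size)

# Shard dimension 0 of the global tensor into world_size partitions.
shard_spec = distspec.shard(process_group=pg, dims=[0], num_partitions=[world_size])
tensor_spec = TensorSpec(shard_spec)

# Each rank passes in its local shard of the global tensor.
local_shard = torch.randn(2, 3)
colo_t = ColoTensor(local_shard, spec=tensor_spec)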
@@ -5,7 +5,7 @@ from typing import List, Optional
 class ProcessGroup:
     """
     Process Group contains group partition for Tensor Parallel and Data Parallel.
-    WARNING, the ProcessGroup must be used after torch.distributed.initialize()
+    NOTE, the ProcessGroup must be used after torch.distributed.initialize()
     args:
         rank: the global rank of the current process.
         ranks: List[int], a list of rank id belongings to this process group.
@@ -15,16 +15,24 @@ class ProcessGroup:
     """

     def __init__(self,
-                 rank: int,
-                 ranks: List[int],
+                 rank: Optional[int] = None,
+                 ranks: Optional[List[int]] = None,
                  backend: str = 'nccl',
                  tp_degree: Optional[int] = None,
                  dp_degree: Optional[int] = None) -> None:
-        self._rank = rank
-        self._rank_list = ranks
+        assert torch.distributed.is_initialized(), f"ProcessGroup must be used after distributed initialized"
+        if rank is None:
+            self._rank = torch.distributed.get_rank()
+        else:
+            self._rank = rank
+
+        if ranks is None:
+            self._rank_list = list(range(torch.distributed.get_world_size()))
+        else:
+            self._rank_list = ranks
+
         self._backend = backend
         self._world_size = len(self._rank_list)
-        assert torch.distributed.is_initialized(), f"ProcessGroup must be used after distributed initialized"

         if dp_degree is None and tp_degree is None:
             self._dp_degree = self._world_size
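The reworked constructor above makes rank and ranks optional: when omitted they default to the caller's global rank and the full set of ranks in the default group, and the distributed-initialization assert now runs first. A minimal sketch of the new call pattern; the single-process gloo setup exists only so the snippet can run standalone (real jobs would use a proper launcher with nccl), and whether the rest of __init__ tolerates a gloo backend is an assumption.

# Sketch of the relaxed ProcessGroup constructor shown in this hunk.
import torch.distributed as dist
from colossalai.tensor import ProcessGroup

if not dist.is_initialized():
    # Single-process group purely for illustration.
    dist.init_process_group(backend='gloo',
                            init_method='tcp://127.0.0.1:29500',
                            rank=0, world_size=1)

# New style: rank defaults to dist.get_rank(), ranks to every rank in the world.
pg = ProcessGroup(backend='gloo')

# Old style still works and is equivalent to the defaults above.
pg_explicit = ProcessGroup(rank=dist.get_rank(),
                           ranks=list(range(dist.get_world_size())),
                           backend='gloo')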