Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-15 22:19:38 +00:00
[doc] Fix typo under colossalai and doc (#3618)
* Fixed several spelling errors under colossalai
* Fixed spelling errors in the colossalai and docs directories
* Cautiously changed spelling errors under the example folder
* Update runtime_preparation_pass.py: revert autograft to autograd
* Update search_chunk.py: utile to until
* Update check_installation.py: change misteach to mismatch in line 91
* Update 1D_tensor_parallel.md: revert to perceptron
* Update 2D_tensor_parallel.md: revert to perceptron in line 73
* Update 2p5D_tensor_parallel.md: revert to perceptron in line 71
* Update 3D_tensor_parallel.md: revert to perceptron in line 80
* Update README.md: revert to resnet in line 42
* Update reorder_graph.py: revert to indice in line 7
* Update p2p.py: revert to megatron in line 94
* Update initialize.py: revert to torchrun in line 198
* Update routers.py: change to detailed in line 63
* Update routers.py: change to detailed in line 146
* Update README.md: revert random number in line 402
@@ -184,7 +184,7 @@ class ColoTensor(torch.Tensor):
         # we have to capture the `backward` function
         # and make sure that it does not in `torch._C.DisableTorchFunction()` context
         if func is torch.Tensor.backward:
-            assert len(args) == 1  # only has 1 paramter
+            assert len(args) == 1  # only has 1 parameter
             backward_tensor = torch.Tensor(args[0])
             tensor_kwargs = {k: torch.Tensor(v) if torch.is_tensor(v) else v for k, v in kwargs.items()}
             return backward_tensor.backward(**tensor_kwargs)
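The hunk above sits inside ColoTensor's `__torch_function__` override, which special-cases `torch.Tensor.backward`. The sketch below is a minimal, standalone illustration of that interception pattern; `MyTensor` is a hypothetical subclass, not the real ColoTensor code.

import torch

class MyTensor(torch.Tensor):
    @classmethod
    def __torch_function__(cls, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}
        # Special-case backward before disabling __torch_function__ dispatch,
        # mirroring the lines changed in the hunk above.
        if func is torch.Tensor.backward:
            assert len(args) == 1  # only has 1 parameter
            backward_tensor = torch.Tensor(args[0])
            tensor_kwargs = {k: torch.Tensor(v) if torch.is_tensor(v) else v for k, v in kwargs.items()}
            return backward_tensor.backward(**tensor_kwargs)
        # Every other op falls through to the plain torch implementation.
        with torch._C.DisableTorchFunction():
            return func(*args, **kwargs)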
@@ -228,7 +228,7 @@ class ColoTensor(torch.Tensor):
         2. If the pg is not not None and not equal to the current process group.
         First, convert the tensor as replicated among the TP process group.
         Second, reset the process group to the new pg.
-        Third, conver the tensor (new replicated both among the tp process group) to the new dist_spec.
+        Third, convert the tensor (new replicated both among the tp process group) to the new dist_spec.

         Args:
             dist_spec (_DistSpec): the new dist spec.
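For intuition, the sketch below simulates the three steps from the docstring on a single process; as an assumption, `torch.cat`/`torch.chunk` stand in for the real collectives and process-group bookkeeping.

import torch

world_size = 4
full = torch.arange(16.0).reshape(4, 4)
# Current dist spec: sharded along dim 0 across the (simulated) TP group.
shards = list(torch.chunk(full, world_size, dim=0))

# Step 1: convert the tensor to a replicated one among the TP group (all-gather).
replicated = torch.cat(shards, dim=0)

# Step 2: in the real code, the tensor's process group is reset to the new pg here.

# Step 3: convert the replicated tensor to the new dist_spec, e.g. sharded along dim 1.
new_shards = list(torch.chunk(replicated, world_size, dim=1))
assert [s.shape for s in new_shards] == [torch.Size([4, 1])] * world_size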
@@ -297,7 +297,7 @@ class ColoTensor(torch.Tensor):
     def size_global(self, *args) -> torch.Size:
         """size_global

-        override the torch buildin size()
+        override the torch building size()
         the shape passed in must be in a replicate placement.

         Returns:
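`size_global` returns the shape of the whole logical tensor rather than the local shard. The following plain-Python sketch conveys that idea; the dim-to-shard-count mapping is an assumption, not the real ColoTensor bookkeeping.

import torch

def size_global_sketch(local_tensor: torch.Tensor, shards_per_dim: dict) -> torch.Size:
    # Scale each sharded dimension of the local shard back up to the global size.
    sizes = list(local_tensor.size())
    for dim, num_shards in shards_per_dim.items():
        sizes[dim] *= num_shards
    return torch.Size(sizes)

local_shard = torch.zeros(2, 8)                  # one rank's piece of an 8 x 8 tensor
print(size_global_sketch(local_shard, {0: 4}))   # torch.Size([8, 8])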
@@ -391,7 +391,7 @@ class CommSpec:
    to determine the buffer shape, and logical_process_axis

    Argument:
-        comm_pattern(CollectiveCommPattern): decribe the communication method used in this spec.
+        comm_pattern(CollectiveCommPattern): describe the communication method used in this spec.
        sharding_spec(ShardingSpec): This is sharding spec of the tensor which will join the communication action.
        gather_dim(int, Optional): The gather_dim of the tensor will be gathered.
        shard_dim(int, Optional): The shard_dim of the tensor will be sharded.
@@ -10,7 +10,7 @@ class ComputePattern(Enum):

 class ComputeSpec(object):
     """ComputeSpec
-    The Specification for compuattion pattern
+    The Specification for computation pattern

     Args:
         compute_pattern (ComputePattern): an Enum instance for compute pattern.
@@ -14,7 +14,7 @@ class Layout:
     """Layout of a tensor.

     Attributes:
-        device_mesh: the device mesh to store the tensor distributedly.
+        device_mesh: the device mesh to store the tensor distributed.
         device_type: the type of the device mesh, e.g. 'cpu' or 'cuda'.
         sharding_spec: the sharding specification to describe how the tensor is sharded.
         entire_shape: the entire shape of the global tensor.
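The attributes listed above amount to a small record type. A hedged dataclass sketch follows; the field names come from the docstring, while the types are assumptions and this is not the real Layout class.

from dataclasses import dataclass
from typing import Any, Tuple

@dataclass
class LayoutSketch:
    device_mesh: Any                # the device mesh the tensor is distributed over
    device_type: str                # e.g. 'cpu' or 'cuda'
    sharding_spec: Any              # how the tensor is sharded across the mesh
    entire_shape: Tuple[int, ...]   # the entire shape of the global tensor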
@@ -14,7 +14,7 @@ NAN = 'nan'

 class DimSpec:
     '''
-    Sharding spec for single dimension of the sharded tensor decribe the sharding dimension of
+    Sharding spec for single dimension of the sharded tensor describe the sharding dimension of
     logical device mesh and give a method to compute the difference between them.
     This class is used internally in ShardingSpec.

@@ -143,7 +143,7 @@ class ShardingSpec:

    Argument:
        dim_partition_dict(Dict[int, List[int]], optional): The key is the dimension of tensor to be sharded,
-            and the value of the key decribe which logical axis will be sharded in that dimension.
+            and the value of the key describe which logical axis will be sharded in that dimension.
        sharding_sequence(List[DimSpec], optional): A straight view of ShardingSpec looks like [R, R, S0, S1].
    '''

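The two arguments describe the same sharding in different forms. The sketch below (a simplified helper, not the real ShardingSpec constructor) shows how a `dim_partition_dict` expands into the straight `[R, R, S0, S1]`-style view.

def to_sharding_sequence(num_dims: int, dim_partition_dict: dict) -> list:
    # 'R' for replicated dims; 'S<axes>' for dims sharded over those mesh axes.
    sequence = []
    for dim in range(num_dims):
        axes = dim_partition_dict.get(dim, [])
        sequence.append('S' + ''.join(str(a) for a in axes) if axes else 'R')
    return sequence

# A 4-D tensor sharded on dim 2 by mesh axis 0 and on dim 3 by mesh axis 1.
print(to_sharding_sequence(4, {2: [0], 3: [1]}))   # ['R', 'R', 'S0', 'S1']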
@@ -61,7 +61,7 @@ class DistSpecManager:
        Args:
            tensor (torch.Tensor): a global (replicated) tensor before shard
            dist_spec (_DistSpec): the distributed spec. to be sharded as.
-            pg (ProcessGrouo): the process group of the corresponding colotensor
+            pg (ProcessGroup): the process group of the corresponding colotensor
        Returns:
            torch.Tensor: a torch tensor after sharded.
        """
@@ -15,7 +15,7 @@ class _DistSpec:
    A class indicates Distributed Specification.
    The DistSpec is only works for the tensor parallel process groups.
    Because the dist spec of data parallel process group can be automatically deduced.
-    This is an internal data structrue.
+    This is an internal data structure.
    The API for users should be `ShardSpec` and `ReplicaSpec`.

    Args:
@@ -73,7 +73,7 @@ class ShapeConsistencyManager(metaclass=SingletonMeta):
                               orig_cost_dict: Dict[str, float]) -> Dict[ShardingSpec, float]:
        '''
        Get all valid sharding specs from source_spec with single all-gather operation, and
-        accumulate commucation cost on origin cost which will finally be used in auto sharding solver.
+        accumulate communication cost on origin cost which will finally be used in auto sharding solver.
        For the all-gather operation, we just care about the S dimension.

        Argument:
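Concretely, each single all-gather removes the last mesh axis from one sharded dimension, which is what `all_gather_simulator` further below does with `shard_list[:-1]`. Here is a simplified, dict-based sketch of enumerating the reachable specs; as an assumption it ignores the ShardingSpec objects and cost tracking used by the real manager.

def all_gather_candidates(dim_partition_dict: dict) -> list:
    # One candidate per sharded dim: gather away the last mesh axis on that dim.
    candidates = []
    for dim in dim_partition_dict:
        new_dict = {d: list(axes) for d, axes in dim_partition_dict.items()}
        new_dict[dim] = new_dict[dim][:-1]
        if not new_dict[dim]:
            del new_dict[dim]
        candidates.append(new_dict)
    return candidates

# Source spec [S01, S2]: dim 0 sharded over mesh axes [0, 1], dim 1 over [2].
print(all_gather_candidates({0: [0, 1], 1: [2]}))   # [{0: [0], 1: [2]}, {0: [0, 1]}]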
@@ -145,7 +145,7 @@ class ShapeConsistencyManager(metaclass=SingletonMeta):
                               orig_cost_dict: Dict[str, float]) -> Dict[ShardingSpec, float]:
        '''
        Get all valid sharding specs from source_spec with single all-to-all operation, and
-        accumulate commucation cost on origin cost which will finally be used in auto sharding solver.
+        accumulate communication cost on origin cost which will finally be used in auto sharding solver.
        For the all-to-all operation, we just care about the pairs containing S dimension.

        Argument:
@@ -18,7 +18,7 @@ NAN = 'nan'

 class _DimSpec:
     '''
-    Sharding spec for single dimension of the sharded tensor decribe the sharding dimension of
+    Sharding spec for single dimension of the sharded tensor describe the sharding dimension of
     logical device mesh and give a method to compute the difference between them.
     This class is used internally in ShardingSpec.

@@ -18,7 +18,7 @@ def all_gather_simulator(target_pair):

    Argument:
        target_pair(Tuple[int, List[int]]): The first element is the dimension of tensor to be sharded,
-        and the second element decribes which logical axis will be sharded in that dimension.
+        and the second element describes which logical axis will be sharded in that dimension.
    '''
    _, shard_list = target_pair
    new_shard_list = shard_list[:-1]
@@ -36,7 +36,7 @@ def all_to_all_simulator(f_target_pair, b_target_pair):
    Therefore, if the behind shard_list is not None, we just extend it to the front shard_list.
    Argument:
        target_pair(Tuple[int, List[int]]): The first element is the dimension of tensor to be sharded,
-        and the second element decribes which logical axis will be sharded in that dimension.
+        and the second element describes which logical axis will be sharded in that dimension.
    e.g.:
        all-to-all(S0, S1) -> [S01, R]
        all-to-all(S0, R) -> [R, S0]
@@ -46,7 +46,7 @@ def all_to_all_simulator(f_target_pair, b_target_pair):

    Argument:
        target_pair(Tuple[int, List[int]]): The first element is the dimension of tensor to be sharded,
-        and the second element decribes which logical axis will be sharded in that dimension.
+        and the second element describes which logical axis will be sharded in that dimension.
    '''
    _, f_shard_list = f_target_pair
    _, b_shard_list = b_target_pair
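Based only on the documented examples above (all-to-all(S0, S1) -> [S01, R] and all-to-all(S0, R) -> [R, S0]), here is a hedged reconstruction of what the simulator computes for the two participating dimensions; it is an assumption, not the actual function body.

def all_to_all_sketch(f_target_pair, b_target_pair):
    _, f_shard_list = f_target_pair
    _, b_shard_list = b_target_pair
    if b_shard_list:
        # e.g. all-to-all(S0, S1) -> [S01, R]: the behind axes are extended onto the front dim.
        return f_shard_list + b_shard_list, []
    # e.g. all-to-all(S0, R) -> [R, S0]: the front dim's axes move to the behind dim.
    return [], f_shard_list

print(all_to_all_sketch((0, [0]), (1, [1])))   # ([0, 1], []) i.e. [S01, R]
print(all_to_all_sketch((0, [0]), (1, [])))    # ([], [0])    i.e. [R, S0]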