mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-03 10:06:44 +00:00

[misc] update pre-commit and run all files (#4752)

* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
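Most of the diff below is mechanical reformatting applied by the updated pre-commit hooks: single-quoted strings become double-quoted, and yapf-style hanging indents become black-style wrapped signatures. A hypothetical before/after illustration of the pattern (the function below is not taken from the commit):

from typing import List, Tuple


# before (yapf-style alignment, single quotes):
# def describe_partition(tensor_dim: int, mesh_dims: List[int],
#                        entire_shape: Tuple[int, ...]) -> str:
#     return f'dim {tensor_dim} of shape {entire_shape} is sharded over mesh axes {mesh_dims}'


# after (black-style wrapping, double quotes):
def describe_partition(
    tensor_dim: int, mesh_dims: List[int], entire_shape: Tuple[int, ...]
) -> str:
    return f"dim {tensor_dim} of shape {entire_shape} is sharded over mesh axes {mesh_dims}"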
@@ -17,9 +17,21 @@ from .sharding import (
 )
 
 __all__ = [
-    'BroadcastType', 'get_broadcast_shape', 'is_broadcastable', 'recover_sharding_spec_for_broadcast_shape',
-    'generate_resharding_costs', 'generate_sharding_spec', 'ignore_sharding_exception', 'check_sharding_spec_validity'
-    'transpose_partition_dim', 'update_partition_dim', 'enumerate_all_possible_1d_sharding',
-    'enumerate_all_possible_2d_sharding', 'generate_sharding_size', 'comm_actions_for_oprands', 'pytree_map',
-    'detect_reshape_mapping', 'check_keep_sharding_status', 'infer_output_dim_partition_dict'
+    "BroadcastType",
+    "get_broadcast_shape",
+    "is_broadcastable",
+    "recover_sharding_spec_for_broadcast_shape",
+    "generate_resharding_costs",
+    "generate_sharding_spec",
+    "ignore_sharding_exception",
+    "check_sharding_spec_validity" "transpose_partition_dim",
+    "update_partition_dim",
+    "enumerate_all_possible_1d_sharding",
+    "enumerate_all_possible_2d_sharding",
+    "generate_sharding_size",
+    "comm_actions_for_oprands",
+    "pytree_map",
+    "detect_reshape_mapping",
+    "check_keep_sharding_status",
+    "infer_output_dim_partition_dict",
 ]
@@ -14,8 +14,11 @@ from colossalai.tensor.comm_spec import CollectiveCommPattern, CommSpec
 from colossalai.tensor.sharding_spec import ShardingSpec
 
 __all__ = [
-    'BroadcastType', 'is_broadcastable', 'get_broadcast_shape', 'recover_sharding_spec_for_broadcast_shape',
-    'comm_actions_for_oprands'
+    "BroadcastType",
+    "is_broadcastable",
+    "get_broadcast_shape",
+    "recover_sharding_spec_for_broadcast_shape",
+    "comm_actions_for_oprands",
 ]
 
 
@@ -41,7 +44,7 @@ def get_broadcast_shape(shape1: torch.Size, shape2: torch.Size) -> List[int]:
     """
     Compute the broadcast shape given two shapes.
     """
-    assert is_broadcastable(shape1, shape2), f'{shape1} and {shape2} are not broadcastable'
+    assert is_broadcastable(shape1, shape2), f"{shape1} and {shape2} are not broadcastable"
     shape1_reverse = shape1[::-1]
     shape2_reverse = shape2[::-1]
     min_common_dim = min(len(shape1), len(shape2))
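The reformatted get_broadcast_shape above follows the usual numpy-style broadcasting rule: shapes are compared from the trailing dimension, sizes must match or be 1, and the larger size is kept. A minimal self-contained sketch of that rule (illustrative only, not the ColossalAI implementation):

from itertools import zip_longest
from typing import List, Sequence


def broadcast_shape_sketch(shape1: Sequence[int], shape2: Sequence[int]) -> List[int]:
    # Walk both shapes from the last dimension, padding the shorter one with 1s.
    result: List[int] = []
    for d1, d2 in zip_longest(reversed(shape1), reversed(shape2), fillvalue=1):
        assert d1 == d2 or d1 == 1 or d2 == 1, f"{shape1} and {shape2} are not broadcastable"
        result.append(max(d1, d2))
    # Dimensions were collected in reverse order, so flip them back.
    return result[::-1]


# e.g. broadcast_shape_sketch((4, 1, 3), (5, 3)) returns [4, 5, 3]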
@@ -60,8 +63,9 @@ def get_broadcast_dim_info(logical_shape, physical_shape):
     logical_num_dims = len(logical_shape)
     physical_num_dims = len(physical_shape)
 
-    assert logical_num_dims >= physical_num_dims, \
-        'The number of dimensions in the logical shape is smaller than that of the physical shape, this tensor is not broadcast!'
+    assert (
+        logical_num_dims >= physical_num_dims
+    ), "The number of dimensions in the logical shape is smaller than that of the physical shape, this tensor is not broadcast!"
 
     # track the dim and its broadcasting type
     logical_dim_broadcast_info = {}
@@ -85,8 +89,9 @@ def get_broadcast_dim_info(logical_shape, physical_shape):
     return logical_dim_broadcast_info
 
 
-def recover_sharding_spec_for_broadcast_shape(logical_sharding_spec: ShardingSpec, logical_shape: torch.Size,
-                                              physical_shape: torch.Size) -> ShardingSpec:
+def recover_sharding_spec_for_broadcast_shape(
+    logical_sharding_spec: ShardingSpec, logical_shape: torch.Size, physical_shape: torch.Size
+) -> ShardingSpec:
     """
     This function computes the sharding spec for the physical shape of a broadcast tensor.
 
@@ -124,15 +129,18 @@ def recover_sharding_spec_for_broadcast_shape(logical_sharding_spe
             physical_dim = physical_num_dims - (logical_num_dims - shape_dim)
             physical_dim_partition[physical_dim] = mesh_dim
 
-    physical_sharding_spec = ShardingSpec(device_mesh=logical_sharding_spec.device_mesh,
-                                          entire_shape=physical_shape,
-                                          dim_partition_dict=physical_dim_partition)
+    physical_sharding_spec = ShardingSpec(
+        device_mesh=logical_sharding_spec.device_mesh,
+        entire_shape=physical_shape,
+        dim_partition_dict=physical_dim_partition,
+    )
 
     return physical_sharding_spec, removed_dims
 
 
-def comm_actions_for_oprands(node: Node, removed_dims: List[int], op_data: OperationData,
-                             sharding_spec: ShardingSpec) -> CommAction:
+def comm_actions_for_oprands(
+    node: Node, removed_dims: List[int], op_data: OperationData, sharding_spec: ShardingSpec
+) -> CommAction:
     """
     This method is used to generate communication actions for oprands which lose information
     during convert logical shape to physical shape.
@@ -140,9 +148,11 @@ def comm_actions_for_oprands(node: Node, removed_dims: List[int], op_data: Opera
     if len(removed_dims) == 1:
         # if list length is 1, extract element from list to avoid using flatten device mesh
         removed_dims = removed_dims[0]
-    comm_spec = CommSpec(comm_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,
-                         sharding_spec=sharding_spec,
-                         logical_process_axis=removed_dims)
+    comm_spec = CommSpec(
+        comm_pattern=CollectiveCommPattern.IDENTITY_FWD_ALLREDUCE_BWD,
+        sharding_spec=sharding_spec,
+        logical_process_axis=removed_dims,
+    )
     if op_data.type == OperationDataType.PARAM:
         comm_type = CommType.HOOK
     else:
@@ -151,7 +161,7 @@ def comm_actions_for_oprands(node: Node, removed_dims: List[int], op_data: Opera
         for index, arg in enumerate(node.args):
             if op_data.name == str(arg):
                 arg_index = index
-        assert arg_index >= 0, f'op_data should be an argument of node.'
+        assert arg_index >= 0, f"op_data should be an argument of node."
         comm_action = CommAction(
             comm_spec=comm_spec,
             comm_type=comm_type,
@@ -14,11 +14,12 @@ from colossalai.tensor.sharding_spec import ShardingSpec
 
 from ..constants import INFINITY_COST
 
-__all__ = ['generate_sharding_spec', 'generate_resharding_costs']
+__all__ = ["generate_sharding_spec", "generate_resharding_costs"]
 
 
-def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: DeviceMesh,
-                           dim_partition_dict: Dict[int, List[int]]) -> ShardingSpec:
+def generate_sharding_spec(
+    input_: Union[Node, torch.Tensor], device_mesh: DeviceMesh, dim_partition_dict: Dict[int, List[int]]
+) -> ShardingSpec:
     """
     Generate the sharding spec of the tensor based on the given dim_partition_dict.
 
@@ -30,7 +31,7 @@ def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: Devic
     """
 
     if isinstance(input_, Node):
-        assert hasattr(input_, '_meta_data'), f'The given node has no attribute _meta_data'
+        assert hasattr(input_, "_meta_data"), f"The given node has no attribute _meta_data"
         meta_tensor = input_._meta_data
         assert meta_tensor is not None, "The given node's _meta_data attribute is None"
         shape = meta_tensor.shape
@@ -38,24 +39,27 @@ def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: Devic
         shape = input_.shape
     else:
         raise TypeError(
-            f'We cannot generate sharding spec for {type(input_)} type, only torch.fx.Node or torch.Tensor is expected.'
+            f"We cannot generate sharding spec for {type(input_)} type, only torch.fx.Node or torch.Tensor is expected."
         )
     for dim_index, sharding_index_list in dim_partition_dict.items():
         sharding_list = [device_mesh.mesh_shape[sharding_index] for sharding_index in sharding_index_list]
         sharding_size = reduce(operator.mul, sharding_list, 1)
-        assert shape[
-            dim_index] % sharding_size == 0, f'we cannot shard the {dim_index} dimension of tensor into {sharding_size} partitions.'
+        assert (
+            shape[dim_index] % sharding_size == 0
+        ), f"we cannot shard the {dim_index} dimension of tensor into {sharding_size} partitions."
 
     sharding_spec = ShardingSpec(device_mesh=device_mesh, entire_shape=shape, dim_partition_dict=dim_partition_dict)
     return sharding_spec
 
 
-def generate_resharding_costs(nodes: List[Node],
-                              sharding_specs: List[ShardingSpec],
-                              count_backward: Optional[bool] = True,
-                              dtype: Optional[torch.dtype] = None,
-                              index=None):
-    '''
+def generate_resharding_costs(
+    nodes: List[Node],
+    sharding_specs: List[ShardingSpec],
+    count_backward: Optional[bool] = True,
+    dtype: Optional[torch.dtype] = None,
+    index=None,
+):
+    """
     Compute the resharding costs with this specific strategy.
 
     Argument:
@@ -63,7 +67,7 @@ def generate_resharding_costs(nodes: List[Node],
         sharding_spec_for_input(ShardingSpec): a list of ShardingSpec for the nodes.
         count_backward (Optional[bool]): whether to include the cost of resharding in the backward pass, default is True. False can be used for inference.
        dtype (Optional[torch.dtype]): the data type for cost calculation, default is None.
-    '''
+    """
     # The resharding_cost of weight is counted due to sharing weight cases.
     resharding_costs = {}
     size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size()
@@ -76,38 +80,39 @@ def generate_resharding_costs(nodes: List[Node],
         for strategy in input_node.strategies_vector:
             input_sharding_spec = strategy.output_sharding_spec
             if not isinstance(input_sharding_spec, ShardingSpec):
-                assert isinstance(input_sharding_spec, list), 'only ShardingSpec or List[ShardingSpec] is expected.'
+                assert isinstance(input_sharding_spec, list), "only ShardingSpec or List[ShardingSpec] is expected."
                 input_sharding_spec = input_sharding_spec[index]
-            assert isinstance(input_sharding_spec, ShardingSpec), f'The input node should NOT be a tuple of tensor.'
+            assert isinstance(input_sharding_spec, ShardingSpec), f"The input node should NOT be a tuple of tensor."
             try:
                 # compute the resharding cost
                 _, _, total_resharding_cost = shape_consistency_manager.shape_consistency(
-                    input_sharding_spec, input_spec)
+                    input_sharding_spec, input_spec
+                )
 
                 # we need multiply the size of elem dtype to get correct communication cost
                 resharding_cost = total_resharding_cost["total"] * size_per_elem_bytes
             except AssertionError as e:
-                warnings.warn(f'{e}')
+                warnings.warn(f"{e}")
                 resharding_cost = INFINITY_COST
             resharding_costs[input_node].append(resharding_cost)
     return resharding_costs
 
 
 def find_repeat_blocks(node_list: List[torch.fx.Node], root_module, common_length_threshold: int = 20):
-    '''
+    """
     Find the largest repeat blocks in the graph, whose length is larger than the threshold.
 
     Args:
         gm (GraphModule): the graph module to be analyzed.
         common_length_threshold (int): the threshold of the repeat block length.
-    '''
+    """
 
     # graph = gm.graph
 
     def _process_args(args):
         new_args = []
         for arg in args:
-            if hasattr(arg, '_meta_data'):
+            if hasattr(arg, "_meta_data"):
                 meta_data = arg._meta_data
             else:
                 meta_data = arg
@@ -145,7 +150,7 @@ def find_repeat_blocks(node_list: List[torch.fx.Node], root_module, common_lengt
         return False
 
     for index, node in enumerate(node_list):
-        if node.op == 'call_module':
+        if node.op == "call_module":
            target = node.target
            submod = root_module.get_submodule(target)
            submod_type = type(submod)
@@ -155,12 +160,12 @@ def find_repeat_blocks(node_list: List[torch.fx.Node], root_module, common_lengt
 
         new_args = _process_args(node.args)
 
-        if node.op != 'get_attr':
+        if node.op != "get_attr":
             hash_key = (node.op, target, *new_args)
         else:
             hash_key = (node.op,)
 
-        setattr(node, 'hash_key', hash_key)
+        setattr(node, "hash_key", hash_key)
 
     hash_value_to_node_dict = {}
 
@@ -179,7 +184,7 @@ def find_repeat_blocks(node_list: List[torch.fx.Node], root_module, common_lengt
         # the comparison will be triggered if a common node appears
         if len(hash_value_to_node_dict[hash(node.hash_key)]) >= 2:
             start_index_list = hash_value_to_node_dict[hash(node.hash_key)]
-            check_block_list = [node_list[start:start + max_common_length] for start in start_index_list]
+            check_block_list = [node_list[start : start + max_common_length] for start in start_index_list]
 
             common_label = True
             if not _all_equal(check_block_list, _check_node_list_equal):
@@ -201,6 +206,6 @@ def find_repeat_blocks(node_list: List[torch.fx.Node], root_module, common_lengt
     # recover common subgraph from the index
     common_blocks = []
     for start in common_blocks_index:
-        common_blocks.append(node_list[start:start + max_common_length])
+        common_blocks.append(node_list[start : start + max_common_length])
 
     return common_blocks
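The find_repeat_blocks changes above are formatting-only; the underlying approach hashes each node's (op, target, args) key and groups candidate start indices by that hash before comparing whole blocks. A toy sketch of the grouping step over a plain list (illustrative, not the ColossalAI code):

from collections import defaultdict
from typing import Dict, Hashable, List, Sequence


def group_starts_by_hash(items: Sequence[Hashable]) -> Dict[int, List[int]]:
    # Map each item's hash to the indices where it occurs; only hashes that
    # occur at least twice can start a repeated block (the ">= 2" check above).
    hash_to_starts: Dict[int, List[int]] = defaultdict(list)
    for index, item in enumerate(items):
        hash_to_starts[hash(item)].append(index)
    return dict(hash_to_starts)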
@@ -1,12 +1,12 @@
 import functools
-from typing import Any, Callable, Dict, List, Tuple, Type, Union
+from typing import Any, Callable, Tuple, Type, Union
 
 import torch
 
 from colossalai.logging import get_dist_logger
 from colossalai.tensor.sharding_spec import ShardingSpec, ShardingSpecException
 
-__all__ = ['ignore_sharding_exception', 'pytree_map']
+__all__ = ["ignore_sharding_exception", "pytree_map"]
 
 
 def ignore_sharding_exception(func):
@@ -48,29 +48,32 @@ def check_sharding_spec_validity(sharding_spec: ShardingSpec, tensor: torch.Tens
     tensor_num_dim = tensor.dim()
     num_devices_in_col = sharding_spec.device_mesh.shape[0]
     num_devices_in_row = sharding_spec.device_mesh.shape[1]
-    assert sharding_len == tensor_num_dim, \
-        f'The ShardingSpec ({sharding_spec.sharding_sequence}) is created for {sharding_len}-dimension tensor, but the given tensor is {tensor_num_dim}-dimension ({tensor.shape}).'
+    assert (
+        sharding_len == tensor_num_dim
+    ), f"The ShardingSpec ({sharding_spec.sharding_sequence}) is created for {sharding_len}-dimension tensor, but the given tensor is {tensor_num_dim}-dimension ({tensor.shape})."
 
     # make sure the sharding is valid for each dim
     for i in range(tensor_num_dim):
         dim_size = tensor.shape[i]
         dim_spec = sharding_spec.sharding_sequence[i]
 
-        if str(dim_spec).startswith('S'):
-            devices_str = str(dim_spec).lstrip('S')
+        if str(dim_spec).startswith("S"):
+            devices_str = str(dim_spec).lstrip("S")
             num_devices = 1
 
-            if '0' in devices_str:
+            if "0" in devices_str:
                 num_devices *= num_devices_in_col
-            if '1' in devices_str:
+            if "1" in devices_str:
                 num_devices *= num_devices_in_row
 
-            assert dim_size >= num_devices and dim_size % num_devices == 0, \
-                f'The dimension at index {i} has value {dim_size}, but it is sharded over {num_devices} devices.'
+            assert (
+                dim_size >= num_devices and dim_size % num_devices == 0
+            ), f"The dimension at index {i} has value {dim_size}, but it is sharded over {num_devices} devices."
 
     # make sure the entire shape matches the physical tensor shape
-    assert sharding_spec.entire_shape == tensor.shape, \
-        f'The entire_shape of the sharding spec {sharding_spec.entire_shape} does not match the tensor shape {tensor.shape}'
+    assert (
+        sharding_spec.entire_shape == tensor.shape
+    ), f"The entire_shape of the sharding spec {sharding_spec.entire_shape} does not match the tensor shape {tensor.shape}"
 
 
 def pytree_map(obj: Any, fn: Callable, process_types: Union[Type, Tuple[Type]] = (), map_all: bool = False) -> Any:
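pytree_map, re-exported in the __all__ edits above, applies a function across a nested container structure. A simplified sketch of the idea, assuming only dicts, lists, and tuples as containers (not the actual ColossalAI implementation):

from typing import Any, Callable, Tuple, Type, Union


def pytree_map_sketch(
    obj: Any, fn: Callable, process_types: Union[Type, Tuple[Type, ...]] = (), map_all: bool = False
) -> Any:
    # Recurse into common containers and apply fn to leaves that match
    # process_types (or to every leaf when map_all is True).
    if isinstance(obj, dict):
        return {k: pytree_map_sketch(v, fn, process_types, map_all) for k, v in obj.items()}
    if isinstance(obj, (list, tuple)):
        return type(obj)(pytree_map_sketch(v, fn, process_types, map_all) for v in obj)
    if map_all or (process_types and isinstance(obj, process_types)):
        return fn(obj)
    return obj


# e.g. pytree_map_sketch({"a": [1, 2]}, lambda x: x * 2, process_types=int) returns {"a": [2, 4]}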
@@ -8,6 +8,7 @@ class PreviousStatus(Enum):
     """
     This class shows the status of previous comparison.
     """
+
     RESET = 0
     # ORIGIN means the dimension size of original tensor is larger in the previous comparison.
     ORIGIN = 1
@@ -130,8 +131,9 @@ def detect_reshape_mapping(origin_shape: torch.Size, tgt_shape: torch.Size) -> D
     return reshape_mapping_dict
 
 
-def check_keep_sharding_status(input_dim_partition_dict: Dict[int, List[int]],
-                               reshape_mapping_dict: Dict[Tuple[int], Tuple[int]]) -> bool:
+def check_keep_sharding_status(
+    input_dim_partition_dict: Dict[int, List[int]], reshape_mapping_dict: Dict[Tuple[int], Tuple[int]]
+) -> bool:
     """
     This method is used to check whether the reshape operation could implement without converting
     the input to fully replicated status.
@@ -172,14 +174,16 @@ def check_keep_sharding_status(input_dim_partition_dict: Dict[int, List[int]],
     return True
 
 
-def infer_output_dim_partition_dict(input_dim_partition_dict: Dict[int, List[int]],
-                                    reshape_mapping_dict: Dict[Tuple[int], Tuple[int]]) -> Dict[Tuple[int], Tuple[int]]:
+def infer_output_dim_partition_dict(
+    input_dim_partition_dict: Dict[int, List[int]], reshape_mapping_dict: Dict[Tuple[int], Tuple[int]]
+) -> Dict[Tuple[int], Tuple[int]]:
     """
     This method is used to infer the output dim partition dict for a reshape operation,
     given the input dim partition dict and reshape mapping dict.
     """
-    assert check_keep_sharding_status(input_dim_partition_dict, reshape_mapping_dict), \
-        'we only infer output dim partition dict for the reshape operation could keep sharding spec.'
+    assert check_keep_sharding_status(
+        input_dim_partition_dict, reshape_mapping_dict
+    ), "we only infer output dim partition dict for the reshape operation could keep sharding spec."
     sharded_dims = list(input_dim_partition_dict.keys())
     output_dim_partition_dict = {}
     for input_dims, output_dims in reshape_mapping_dict.items():

@@ -8,8 +8,11 @@ import torch
 from colossalai.tensor.sharding_spec import ShardingSpec
 
 __all__ = [
-    'transpose_partition_dim', 'update_partition_dim', 'enumerate_all_possible_1d_sharding',
-    'enumerate_all_possible_2d_sharding', 'generate_sharding_size'
+    "transpose_partition_dim",
+    "update_partition_dim",
+    "enumerate_all_possible_1d_sharding",
+    "enumerate_all_possible_2d_sharding",
+    "generate_sharding_size",
 ]
 
 
@@ -22,8 +25,7 @@ def transpose_partition_dim(sharding_spec: ShardingSpec, dim1: int, dim2: int) -
         dim1 (int): the tensor dimension to switch
         dim2 (int): the tensor dimension to switch
     """
-    assert len(sharding_spec.entire_shape) >= 2, \
-        'The entire_shape of the sharding spec must have at least 2 dimensions'
+    assert len(sharding_spec.entire_shape) >= 2, "The entire_shape of the sharding spec must have at least 2 dimensions"
     dim_partition_dict = sharding_spec.dim_partition_dict
 
     # transpose the dim partition
@@ -45,10 +47,9 @@ def transpose_partition_dim(sharding_spec: ShardingSpec, dim1: int, dim2: int) -
     return sharding_spec
 
 
-def update_partition_dim(sharding_spec: ShardingSpec,
-                         dim_mapping: Dict[int, int],
-                         physical_shape: torch.Size,
-                         inplace: bool = False):
+def update_partition_dim(
+    sharding_spec: ShardingSpec, dim_mapping: Dict[int, int], physical_shape: torch.Size, inplace: bool = False
+):
     """
     This method is used to update the partition dim dict from the logical one to the physical one.
 
@@ -78,9 +79,9 @@ def update_partition_dim(sharding_spec: ShardingSpec,
         new_dim_partition_dict[tensor_dim] = mesh_dims
 
     # update sharding spec
-    current_sharding_spec.__init__(device_mesh=sharding_spec.device_mesh,
-                                   entire_shape=physical_shape,
-                                   dim_partition_dict=new_dim_partition_dict)
+    current_sharding_spec.__init__(
+        device_mesh=sharding_spec.device_mesh, entire_shape=physical_shape, dim_partition_dict=new_dim_partition_dict
+    )
     return current_sharding_spec
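update_partition_dim above remaps sharded tensor dimensions from logical to physical indices via dim_mapping before rebuilding the spec. The key-remapping step by itself looks roughly like this (a sketch under assumed inputs, not the library code):

from typing import Dict, List


def remap_partition_dims(
    dim_partition_dict: Dict[int, List[int]], dim_mapping: Dict[int, int]
) -> Dict[int, List[int]]:
    # Keys are tensor dimensions, values are the device-mesh axes they are
    # sharded over; dimensions missing from dim_mapping keep their index.
    return {dim_mapping.get(dim, dim): mesh_dims for dim, mesh_dims in dim_partition_dict.items()}


# e.g. remap_partition_dims({0: [0], 2: [1]}, {0: 1, 2: 3}) returns {1: [0], 3: [1]}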