mirror of https://github.com/hpcaitech/ColossalAI.git
synced 2025-07-18 01:12:42 +00:00
fix typo colossalai/auto_parallel autochunk fx/passes etc. (#3808)
This commit is contained in:
parent 725365f297
commit 7f8203af69
.github/workflows/README.md (vendored): 4 changed lines
@@ -14,7 +14,7 @@
 - [Compatibility Test on Dispatch](#compatibility-test-on-dispatch)
 - [Release](#release)
 - [User Friendliness](#user-friendliness)
-- [Commmunity](#commmunity)
+- [Community](#community)
 - [Configuration](#configuration)
 - [Progress Log](#progress-log)

@@ -97,7 +97,7 @@ This workflow is triggered by manually dispatching the workflow. It has the foll
 | `Synchronize submodule` | `submodule.yml` | This workflow will check if any git submodule is updated. If so, it will create a PR to update the submodule pointers. |
 | `Close inactive issues` | `close_inactive.yml` | This workflow will close issues which are stale for 14 days. |

-### Commmunity
+### Community

 | Workflow Name | File name | Description |
 | -------------------------------------------- | -------------------------------- | -------------------------------------------------------------------------------- |

@@ -148,7 +148,7 @@ class MetaInfoProp:
         graph_info.fwd_tmp = buffer_tensors
         graph_info.fwd_out = output_tensors

-        # fetch other memory informations
+        # fetch other memory information
         memory_cost = meta_info.memory_cost
         graph_info.fwd_mem_tmp = memory_cost.fwd.temp
         graph_info.fwd_mem_out = memory_cost.fwd.activation

@@ -44,7 +44,7 @@ class BatchNormStrategyGenerator(StrategyGenerator):
         '''
         Compute the computation cost per device with this specific strategy.

-        Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
         '''
         # TODO: a constant coefficient need to be added.
         # 1D: (L) * N * Cin

@@ -38,9 +38,9 @@ class ConvStrategyGenerator(StrategyGenerator):
         '''
         Compute the computation cost per device with this specific strategy.

-        Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
         '''
-        # TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        # TODO: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
         # 1D: (L) * N * Cout * Cin * kernel
         # 2D: (H * W) * N * Cout * Cin * kernel
         # 3D: (H * W * D) * N * Cout * Cin * kernel

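For reference, the "computation size" these comments describe can be computed directly from the sharded output shape. A minimal sketch under that reading (hypothetical helper, not part of this commit); dividing the result by a device's TFLOPS, plus a constant coefficient, would turn it into an estimated time:

def conv2d_compute_size(n, c_in, c_out, h_out, w_out, kernel_h, kernel_w):
    # 2D case: (H * W) * N * Cout * Cin * kernel, matching the comment above.
    return h_out * w_out * n * c_out * c_in * kernel_h * kernel_w

# e.g. a 3x3 conv producing 128 channels on a 56x56 map with batch size 8
ops = conv2d_compute_size(n=8, c_in=64, c_out=128, h_out=56, w_out=56, kernel_h=3, kernel_w=3)
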
@@ -34,9 +34,9 @@ class LayerNormGenerator(StrategyGenerator):
         '''
         Compute the computation cost per device with this specific strategy.

-        Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
         '''
-        # TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        # TODO: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
         # TODO: a constant coefficient need to be added.

         sharded_input_shape = strategy.sharding_specs[self.op_data['input']].get_sharded_shape_per_device()

@@ -17,7 +17,7 @@ class NormalPoolStrategyGenerator(StrategyGenerator):
     """
     NormalPoolStrategyGenerator is a generic class to generate strategies for pool operation like MaxPoolxd.
     The reason we call this normal pool is AvgPoolxd and MaxPoolxd are taking the kernel size element from image,
-    and reduce them depening on the operation type.
+    and reduce them depending on the operation type.
     """

     def validate(self) -> bool:

@@ -35,9 +35,9 @@ class NormalPoolStrategyGenerator(StrategyGenerator):
         '''
         Compute the computation cost per device with this specific strategy.

-        Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        Note: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
         '''
-        # TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        # TODO: compute_cost need to be divided by TFLOPS, now it just shows the computation size.
         # 1D: (Lout) * N * C * kernel
         # 2D: (H * W) * N * Cout * Cin * kernel
         # 3D: (H * W * D) * N * Cout * Cin * kernel

@@ -366,8 +366,8 @@ class TraceFlow(object):
         # find non chunk inputs
         chunk_info = self._get_non_chunk_inputs(chunk_info, start_idx, end_idx)

-        # reassgin reshape size, some size may have changed due to chunk
-        chunk_info = self._reassgin_reshape_size(chunk_info)
+        # reassign reshape size, some size may have changed due to chunk
+        chunk_info = self._reassign_reshape_size(chunk_info)

         return chunk_info

@@ -428,10 +428,10 @@ class TraceFlow(object):
             chunk_info["outputs_dim"].append(output_dim)
         return True

-    def _reassgin_reshape_size(self, chunk_info):
+    def _reassign_reshape_size(self, chunk_info):
         """
         Some shape args in reshape may have changed due to chunk
-        reassgin those changed shape
+        reassign those changed shape
         """
         chunk_region = chunk_info["region"]
         reshape_size = {}

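For context on why reshape sizes need reassigning: when a region is executed chunk by chunk, any literal shape argument along the chunked dimension must shrink to the chunk length. A rough sketch of the idea (hypothetical, not the actual TraceFlow code):

def rescale_reshape_args(shape_args, chunk_dim, chunk_len):
    # Replace the size recorded along the chunked dimension with the chunk length,
    # leaving every other dimension untouched.
    new_args = list(shape_args)
    new_args[chunk_dim] = chunk_len
    return tuple(new_args)

# A reshape originally emitted as x.reshape(2, 1024, 64) would, once the region is
# chunked along dim 1 with chunk size 128, need to become x.reshape(2, 128, 64).
assert rescale_reshape_args((2, 1024, 64), chunk_dim=1, chunk_len=128) == (2, 128, 64)
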
@@ -397,7 +397,7 @@ class TraceIndice(object):
         input_node = node.args[0]
         assert len(get_node_shape(input_node)) == 4

-        # assgin index
+        # assign index
         self._assign_indice_as_input(node, node_idx, input_node)
         self._del_dim(node_idx, 1)
         self._add_dim(node_idx, 1)

@@ -415,7 +415,7 @@ class TraceIndice(object):
         assert node.kwargs['size'] is None
         assert len(get_node_shape(node)) == 4

-        # assgin index
+        # assign index
         self._assign_indice_as_input(node, node_idx)
         self._mark_computation(node, node_idx, [-1, -2])

@@ -179,7 +179,7 @@ class GeminiPlugin(DPPluginBase):
             Users can provide this argument to speed up searching.
             If users do not know this argument before training, it is ok. We will use a default value 1024.
         min_chunk_size_mb (float, optional): the minimum chunk size in MegaByte.
-            If the aggregate size of parameters is still samller than the minimum chunk size,
+            If the aggregate size of parameters is still smaller than the minimum chunk size,
             all parameters will be compacted into one small chunk.
         memstats (MemStats, optional) the memory statistics collector by a runtime memory tracer.
         gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)

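A hedged usage sketch of the two arguments named in this docstring excerpt (the values are illustrative, everything else is left at its default; this is not part of the commit):

from colossalai.booster import Booster
from colossalai.booster.plugin import GeminiPlugin

# Pack all parameters into a single chunk if they total less than 32 MB, and
# hand none of the leftover GPU memory to the optimizer after the first step.
plugin = GeminiPlugin(min_chunk_size_mb=32, gpu_margin_mem_ratio=0.0)
booster = Booster(plugin=plugin)
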
@@ -181,7 +181,7 @@ class DistCoordinator(metaclass=SingletonMeta):
         """
         is_master = self.is_master(process_group)

-        # define an inner functiuon
+        # define an inner function
         def decorator(func):

             @functools.wraps(func)

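The inner function being defined here follows the usual master-only decorator pattern. A self-contained sketch of that pattern (not the DistCoordinator implementation itself):

import functools

def on_master_only(is_master: bool):
    """Return a decorator that runs the wrapped function only when is_master is True."""
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            if is_master:
                return func(*args, **kwargs)
            return None  # other ranks silently skip the call
        return wrapper
    return decorator

@on_master_only(is_master=True)
def log(msg):
    print(msg)

log("only the master rank prints this")
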
@@ -381,7 +381,7 @@ class AlphaBetaProfiler:
         first_latency, first_bandwidth = _extract_alpha_beta(first_axis, first_axis_process_group)
         second_latency, second_bandwidth = _extract_alpha_beta(second_axis, second_axis_process_group)
         mesh_alpha = [first_latency, second_latency]
-        # The beta values have been enlarged by 1e10 times temporarilly because the computation cost
+        # The beta values have been enlarged by 1e10 times temporarily because the computation cost
         # is still estimated in the unit of TFLOPs instead of time. We will remove this factor in future.
         mesh_beta = [1e10 / first_bandwidth, 1e10 / second_bandwidth]

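For context, alpha here is the per-message latency and beta the inverse-bandwidth term of the alpha-beta communication model; the 1e10 factor keeps beta on a scale comparable with the TFLOP-based compute estimate, as the comment says. A small illustrative sketch with hypothetical measurements:

# Measured per-axis latency (s) and bandwidth (bytes/s) of the device mesh.
first_latency, first_bandwidth = 2.5e-5, 150e9     # e.g. an intra-node NVLink axis
second_latency, second_bandwidth = 1.0e-4, 12e9    # e.g. an inter-node network axis

mesh_alpha = [first_latency, second_latency]
# Temporarily scaled by 1e10 so it is comparable with costs expressed in TFLOPs.
mesh_beta = [1e10 / first_bandwidth, 1e10 / second_bandwidth]

# alpha-beta cost of sending `size_bytes` along an axis: alpha + beta * size
def comm_cost(axis, size_bytes):
    return mesh_alpha[axis] + mesh_beta[axis] * size_bytes
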
@@ -152,9 +152,9 @@ class PipelineSchedule(BaseSchedule):
             raise TypeError(f"Expected data to be of type torch.Tensor, list, tuple, or dict, but got {type(data)}")

     def load_micro_batch(self):
-        mciro_batch_data = self._get_data_slice(self.batch_data, self.microbatch_offset)
+        micro_batch_data = self._get_data_slice(self.batch_data, self.microbatch_offset)
         self.microbatch_offset += self.microbatch_size
-        return self._move_to_device(mciro_batch_data)
+        return self._move_to_device(micro_batch_data)

     def pre_processing(self, engine):
         from colossalai.zero.legacy import ShardedModelV2

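For context, load_micro_batch walks a fixed-size window over the already-loaded batch. A minimal sketch of that slicing logic on a plain tensor (hypothetical, not the schedule's _get_data_slice):

import torch

batch = torch.arange(8).reshape(8, 1)   # global batch of 8 samples
microbatch_size = 2
offset = 0

microbatches = []
while offset < batch.size(0):
    microbatches.append(batch[offset:offset + microbatch_size])
    offset += microbatch_size

assert len(microbatches) == 4           # 8 samples / micro-batch size 2
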
@@ -84,7 +84,7 @@ class PipelineScheduleV2(PipelineSchedule):
             'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.'
         self.load_batch(data_iter)

-        # num_warmup_microbatches is the step when not all the processers are working
+        # num_warmup_microbatches is the step when not all the processes are working
         num_warmup_microbatches = \
             (gpc.get_world_size(ParallelMode.PIPELINE)
              - gpc.get_local_rank(ParallelMode.PIPELINE) - 1)

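The formula in this hunk gives each stage world_size - rank - 1 warmup micro-batches: the last stage starts interleaving forward and backward passes immediately, while earlier stages first fill the pipeline. A worked sketch with hypothetical sizes:

pipeline_world_size = 4        # number of pipeline stages
num_microbatches = 8

for rank in range(pipeline_world_size):
    # forward steps this stage runs before a backward pass is available to interleave
    num_warmup = pipeline_world_size - rank - 1
    num_1f1b = num_microbatches - num_warmup
    print(f"stage {rank}: {num_warmup} warmup forwards, {num_1f1b} 1F1B steps")
# stage 0 warms up with 3 forwards, the last stage with 0.
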
@@ -523,7 +523,7 @@ def emit_code_with_activation_checkpoint(body, ckpt_func, nodes, emit_node_func,
     # append code text to body
     for idx, node in enumerate(node_list):
         # if this is the first node of the ckpt region
-        # append the ckpt function defition
+        # append the ckpt function definition
         if idx in start_idx:
             label = start_idx.index(idx)
             ckpt_fn_def = _gen_ckpt_fn_def(label, input_vars[label])

@@ -206,7 +206,7 @@ def avgcompute_split_pass(gm: torch.fx.GraphModule, pp_size: int):

 def avgnode_split_pass(gm: torch.fx.GraphModule, pp_size: int):
     """
-    In avgnode_split_pass, simpliy split graph by node number.
+    In avgnode_split_pass, simply split graph by node number.
     """
     mod_graph = gm.graph
     avg_num_node = len(mod_graph.nodes) // pp_size

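Splitting the graph "by node number" simply means cutting after every len(nodes) // pp_size nodes. A standalone sketch of that partitioning (hypothetical, operating on a plain list rather than an fx graph):

def split_by_node_count(nodes, pp_size):
    # Assign nodes to pp_size stages in contiguous blocks of roughly equal length.
    avg = max(1, len(nodes) // pp_size)
    stages = [nodes[i * avg:(i + 1) * avg] for i in range(pp_size - 1)]
    stages.append(nodes[(pp_size - 1) * avg:])   # last stage takes the remainder
    return stages

assert split_by_node_count(list(range(10)), 3) == [[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]]
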
@@ -16,7 +16,7 @@ def apply(*args, **kwargs):
     return shape_consistency_manager.apply(*args, **kwargs)


-def solution_annotatation_pass(gm: torch.fx.GraphModule, solution: List[int], device_mesh):
+def solution_annotation_pass(gm: torch.fx.GraphModule, solution: List[int], device_mesh):
     mod_graph = gm.graph
     nodes = tuple(mod_graph.nodes)

@@ -31,7 +31,7 @@ class TensorMetadata(NamedTuple):
     numel: int
     is_tensor: bool
     # TODO: we can add a list of sharding spec here, and record the sharding
-    # behaviour by appending sharding spec into list.
+    # behavior by appending sharding spec into list.


 def _extract_tensor_metadata(result: torch.Tensor) -> TensorMetadata:

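For context, _extract_tensor_metadata records per-tensor facts that later passes (and, per the TODO, possibly sharding specs) can consume without holding the tensor itself. A stripped-down sketch using only the fields visible in this hunk plus the obvious shape and dtype (a hypothetical field set, not the full TensorMetadata):

from typing import NamedTuple
import torch

class TinyTensorMetadata(NamedTuple):
    shape: torch.Size
    dtype: torch.dtype
    numel: int
    is_tensor: bool

def extract(result) -> TinyTensorMetadata:
    if isinstance(result, torch.Tensor):
        return TinyTensorMetadata(result.shape, result.dtype, result.numel(), True)
    return TinyTensorMetadata(torch.Size([]), torch.float32, 0, False)

meta = extract(torch.zeros(2, 3))
assert meta.numel == 6 and meta.is_tensor
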
@@ -230,7 +230,7 @@ def split_module_for_gpt2_test(
             use_partition.partitions_dependent_on.setdefault(def_partition_name)

     node_process_list = list(m.graph.nodes)
-    # split nodes into parititons
+    # split nodes into partitions
     while node_process_list:
         node = node_process_list.pop(0)
         orig_nodes[node.name] = node

@@ -277,7 +277,7 @@ def split_module_for_gpt2_test(
     if len(sorted_partitions) != len(partitions):
         raise RuntimeError("cycle exists between partitions!")

-    # add placeholders to parititons
+    # add placeholders to partitions
     for partition_name in sorted_partitions:
         partition = partitions[partition_name]
         for input in partition.inputs:

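The "cycle exists between partitions" check falls out of a topological sort over partition dependencies: if the sort cannot place every partition, the dependency graph has a cycle. A minimal sketch of that check (hypothetical, using a plain dependency dict rather than the Partition objects below):

def topo_sort_partitions(deps):
    """deps maps a partition name to the set of partition names it depends on."""
    sorted_parts, placed = [], set()
    while len(sorted_parts) < len(deps):
        ready = [p for p, d in deps.items() if p not in placed and d <= placed]
        if not ready:                       # nothing can be placed -> cycle
            raise RuntimeError("cycle exists between partitions!")
        for p in ready:
            sorted_parts.append(p)
            placed.add(p)
    return sorted_parts

assert topo_sort_partitions({"a": set(), "b": {"a"}, "c": {"b"}}) == ["a", "b", "c"]
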
@@ -29,8 +29,8 @@ class Partition:
                f" nodes: {self.node_names},\n" \
                f" inputs: {self.inputs},\n" \
                f" outputs: {self.outputs},\n" \
-               f" partitions depenent on: {self.partitions_dependent_on},\n" \
-               f" parition dependents: {self.partition_dependents}"
+               f" partitions dependent on: {self.partitions_dependent_on},\n" \
+               f" partition dependents: {self.partition_dependents}"


 # Creates subgraphs out of main graph