diff --git a/.github/workflows/README.md b/.github/workflows/README.md
index 8fc14e0d5..f40f4cc86 100644
--- a/.github/workflows/README.md
+++ b/.github/workflows/README.md
@@ -14,7 +14,7 @@
   - [Compatibility Test on Dispatch](#compatibility-test-on-dispatch)
   - [Release](#release)
   - [User Friendliness](#user-friendliness)
-  - [Commmunity](#commmunity)
+  - [Community](#community)
   - [Configuration](#configuration)
   - [Progress Log](#progress-log)
 
@@ -97,7 +97,7 @@ This workflow is triggered by manually dispatching the workflow. It has the foll
 | `Synchronize submodule` | `submodule.yml` | This workflow will check if any git submodule is updated. If so, it will create a PR to update the submodule pointers. |
 | `Close inactive issues` | `close_inactive.yml` | This workflow will close issues which are stale for 14 days. |
 
-### Commmunity
+### Community
 
 | Workflow Name | File name | Description |
 | -------------------------------------------- | -------------------------------- | -------------------------------------------------------------------------------- |
diff --git a/colossalai/auto_parallel/passes/meta_info_prop.py b/colossalai/auto_parallel/passes/meta_info_prop.py
index bc0960483..0673b767d 100644
--- a/colossalai/auto_parallel/passes/meta_info_prop.py
+++ b/colossalai/auto_parallel/passes/meta_info_prop.py
@@ -148,7 +148,7 @@ class MetaInfoProp:
         graph_info.fwd_tmp = buffer_tensors
         graph_info.fwd_out = output_tensors
 
-        # fetch other memory informations
+        # fetch other memory information
         memory_cost = meta_info.memory_cost
         graph_info.fwd_mem_tmp = memory_cost.fwd.temp
         graph_info.fwd_mem_out = memory_cost.fwd.activation
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py
index 79b69acb2..416dc9c29 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py
@@ -44,7 +44,7 @@ class BatchNormStrategyGenerator(StrategyGenerator):
         '''
         Compute the computation cost per device with this specific strategy.
 
-        Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        Note: compute_cost needs to be divided by TFLOPS; for now, it just shows the computation size.
         '''
         # TODO: a constant coefficient need to be added.
         # 1D: (L) * N * Cin
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/conv_strategy_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/conv_strategy_generator.py
index c2154b310..e605a68a3 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/conv_strategy_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/conv_strategy_generator.py
@@ -38,9 +38,9 @@ class ConvStrategyGenerator(StrategyGenerator):
         '''
         Compute the computation cost per device with this specific strategy.
 
-        Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        Note: compute_cost needs to be divided by TFLOPS; for now, it just shows the computation size.
         '''
-        # TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        # TODO: compute_cost needs to be divided by TFLOPS; for now, it just shows the computation size.
         # 1D: (L) * N * Cout * Cin * kernel
         # 2D: (H * W) * N * Cout * Cin * kernel
         # 3D: (H * W * D) * N * Cout * Cin * kernel
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/layer_norm_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/layer_norm_generator.py
index fbb6070f7..65b173bbf 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/layer_norm_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/layer_norm_generator.py
@@ -34,9 +34,9 @@ class LayerNormGenerator(StrategyGenerator):
         '''
         Compute the computation cost per device with this specific strategy.
 
-        Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        Note: compute_cost needs to be divided by TFLOPS; for now, it just shows the computation size.
         '''
-        # TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        # TODO: compute_cost needs to be divided by TFLOPS; for now, it just shows the computation size.
         # TODO: a constant coefficient need to be added.
 
         sharded_input_shape = strategy.sharding_specs[self.op_data['input']].get_sharded_shape_per_device()
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/normal_pooling_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/normal_pooling_generator.py
index 9df6d2fbf..b7db42f8f 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/normal_pooling_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/normal_pooling_generator.py
@@ -17,7 +17,7 @@ class NormalPoolStrategyGenerator(StrategyGenerator):
     """
     NormalPoolStrategyGenerator is a generic class to generate strategies for pool operation like MaxPoolxd.
     The reason we call this normal pool is AvgPoolxd and MaxPoolxd are taking the kernel size element from image,
-    and reduce them depening on the operation type.
+    and reduce them depending on the operation type.
     """
 
     def validate(self) -> bool:
@@ -35,9 +35,9 @@ class NormalPoolStrategyGenerator(StrategyGenerator):
         '''
         Compute the computation cost per device with this specific strategy.
 
-        Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        Note: compute_cost needs to be divided by TFLOPS; for now, it just shows the computation size.
         '''
-        # TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
+        # TODO: compute_cost needs to be divided by TFLOPS; for now, it just shows the computation size.
         # 1D: (Lout) * N * C * kernel
         # 2D: (H * W) * N * Cout * Cin * kernel
         # 3D: (H * W * D) * N * Cout * Cin * kernel
diff --git a/colossalai/autochunk/trace_flow.py b/colossalai/autochunk/trace_flow.py
index 11a7e62ff..a1080fda1 100644
--- a/colossalai/autochunk/trace_flow.py
+++ b/colossalai/autochunk/trace_flow.py
@@ -366,8 +366,8 @@ class TraceFlow(object):
         # find non chunk inputs
         chunk_info = self._get_non_chunk_inputs(chunk_info, start_idx, end_idx)
 
-        # reassgin reshape size, some size may have changed due to chunk
-        chunk_info = self._reassgin_reshape_size(chunk_info)
+        # reassign reshape sizes; some sizes may have changed due to chunking
+        chunk_info = self._reassign_reshape_size(chunk_info)
 
         return chunk_info
 
@@ -428,10 +428,10 @@ class TraceFlow(object):
             chunk_info["outputs_dim"].append(output_dim)
         return True
 
-    def _reassgin_reshape_size(self, chunk_info):
+    def _reassign_reshape_size(self, chunk_info):
         """
         Some shape args in reshape may have changed due to chunk
-        reassgin those changed shape
+        reassign those changed shapes
         """
         chunk_region = chunk_info["region"]
         reshape_size = {}
diff --git a/colossalai/autochunk/trace_indice.py b/colossalai/autochunk/trace_indice.py
index 8e6cd3e29..fbe0741b8 100644
--- a/colossalai/autochunk/trace_indice.py
+++ b/colossalai/autochunk/trace_indice.py
@@ -397,7 +397,7 @@ class TraceIndice(object):
         input_node = node.args[0]
         assert len(get_node_shape(input_node)) == 4
 
-        # assgin index
+        # assign index
         self._assign_indice_as_input(node, node_idx, input_node)
         self._del_dim(node_idx, 1)
         self._add_dim(node_idx, 1)
@@ -415,7 +415,7 @@ class TraceIndice(object):
         assert node.kwargs['size'] is None
         assert len(get_node_shape(node)) == 4
 
-        # assgin index
+        # assign index
         self._assign_indice_as_input(node, node_idx)
         self._mark_computation(node, node_idx, [-1, -2])
 
diff --git a/colossalai/booster/plugin/gemini_plugin.py b/colossalai/booster/plugin/gemini_plugin.py
index bb3124642..adbf4803e 100644
--- a/colossalai/booster/plugin/gemini_plugin.py
+++ b/colossalai/booster/plugin/gemini_plugin.py
@@ -179,7 +179,7 @@ class GeminiPlugin(DPPluginBase):
             Users can provide this argument to speed up searching.
             If users do not know this argument before training, it is ok. We will use a default value 1024.
         min_chunk_size_mb (float, optional): the minimum chunk size in MegaByte.
-            If the aggregate size of parameters is still samller than the minimum chunk size,
+            If the aggregate size of parameters is still smaller than the minimum chunk size,
             all parameters will be compacted into one small chunk.
         memstats (MemStats, optional) the memory statistics collector by a runtime memory tracer.
         gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
diff --git a/colossalai/cluster/dist_coordinator.py b/colossalai/cluster/dist_coordinator.py
index 99dde810e..3ee364ec3 100644
--- a/colossalai/cluster/dist_coordinator.py
+++ b/colossalai/cluster/dist_coordinator.py
@@ -181,7 +181,7 @@ class DistCoordinator(metaclass=SingletonMeta):
         """
         is_master = self.is_master(process_group)
 
-        # define an inner functiuon
+        # define an inner function
        def decorator(func):
 
             @functools.wraps(func)
diff --git a/colossalai/device/alpha_beta_profiler.py b/colossalai/device/alpha_beta_profiler.py
index af2b10928..f8b20de9b 100644
--- a/colossalai/device/alpha_beta_profiler.py
+++ b/colossalai/device/alpha_beta_profiler.py
@@ -381,7 +381,7 @@ class AlphaBetaProfiler:
         first_latency, first_bandwidth = _extract_alpha_beta(first_axis, first_axis_process_group)
         second_latency, second_bandwidth = _extract_alpha_beta(second_axis, second_axis_process_group)
         mesh_alpha = [first_latency, second_latency]
-        # The beta values have been enlarged by 1e10 times temporarilly because the computation cost
+        # The beta values have been enlarged by 1e10 times temporarily because the computation cost
         # is still estimated in the unit of TFLOPs instead of time. We will remove this factor in future.
         mesh_beta = [1e10 / first_bandwidth, 1e10 / second_bandwidth]
diff --git a/colossalai/engine/schedule/_pipeline_schedule.py b/colossalai/engine/schedule/_pipeline_schedule.py
index 38175fe09..9fc301a26 100644
--- a/colossalai/engine/schedule/_pipeline_schedule.py
+++ b/colossalai/engine/schedule/_pipeline_schedule.py
@@ -152,9 +152,9 @@ class PipelineSchedule(BaseSchedule):
             raise TypeError(f"Expected data to be of type torch.Tensor, list, tuple, or dict, but got {type(data)}")
 
     def load_micro_batch(self):
-        mciro_batch_data = self._get_data_slice(self.batch_data, self.microbatch_offset)
+        micro_batch_data = self._get_data_slice(self.batch_data, self.microbatch_offset)
         self.microbatch_offset += self.microbatch_size
-        return self._move_to_device(mciro_batch_data)
+        return self._move_to_device(micro_batch_data)
 
     def pre_processing(self, engine):
         from colossalai.zero.legacy import ShardedModelV2
diff --git a/colossalai/engine/schedule/_pipeline_schedule_v2.py b/colossalai/engine/schedule/_pipeline_schedule_v2.py
index 28c58bd82..89e45c7aa 100644
--- a/colossalai/engine/schedule/_pipeline_schedule_v2.py
+++ b/colossalai/engine/schedule/_pipeline_schedule_v2.py
@@ -84,7 +84,7 @@ class PipelineScheduleV2(PipelineSchedule):
             'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.'
 
         self.load_batch(data_iter)
-        # num_warmup_microbatches is the step when not all the processers are working
+        # num_warmup_microbatches is the number of steps during which not all processes are working
         num_warmup_microbatches = \
             (gpc.get_world_size(ParallelMode.PIPELINE) - gpc.get_local_rank(ParallelMode.PIPELINE) - 1)
diff --git a/colossalai/fx/codegen/activation_checkpoint_codegen.py b/colossalai/fx/codegen/activation_checkpoint_codegen.py
index 5a72cb9ca..33b164800 100644
--- a/colossalai/fx/codegen/activation_checkpoint_codegen.py
+++ b/colossalai/fx/codegen/activation_checkpoint_codegen.py
@@ -523,7 +523,7 @@ def emit_code_with_activation_checkpoint(body, ckpt_func, nodes, emit_node_func,
     # append code text to body
     for idx, node in enumerate(node_list):
         # if this is the first node of the ckpt region
-        # append the ckpt function defition
+        # append the ckpt function definition
         if idx in start_idx:
             label = start_idx.index(idx)
             ckpt_fn_def = _gen_ckpt_fn_def(label, input_vars[label])
diff --git a/colossalai/fx/passes/adding_split_node_pass.py b/colossalai/fx/passes/adding_split_node_pass.py
index 2c7b842b5..245ba5d77 100644
--- a/colossalai/fx/passes/adding_split_node_pass.py
+++ b/colossalai/fx/passes/adding_split_node_pass.py
@@ -206,7 +206,7 @@ def avgcompute_split_pass(gm: torch.fx.GraphModule, pp_size: int):
 
 def avgnode_split_pass(gm: torch.fx.GraphModule, pp_size: int):
     """
-    In avgnode_split_pass, simpliy split graph by node number.
+    In avgnode_split_pass, simply split the graph by node number.
     """
     mod_graph = gm.graph
     avg_num_node = len(mod_graph.nodes) // pp_size
diff --git a/colossalai/fx/passes/experimental/adding_shape_consistency_pass.py b/colossalai/fx/passes/experimental/adding_shape_consistency_pass.py
index f28d65e26..4571bd93a 100644
--- a/colossalai/fx/passes/experimental/adding_shape_consistency_pass.py
+++ b/colossalai/fx/passes/experimental/adding_shape_consistency_pass.py
@@ -16,7 +16,7 @@ def apply(*args, **kwargs):
     return shape_consistency_manager.apply(*args, **kwargs)
 
 
-def solution_annotatation_pass(gm: torch.fx.GraphModule, solution: List[int], device_mesh):
+def solution_annotation_pass(gm: torch.fx.GraphModule, solution: List[int], device_mesh):
     mod_graph = gm.graph
     nodes = tuple(mod_graph.nodes)
 
diff --git a/colossalai/fx/passes/meta_info_prop.py b/colossalai/fx/passes/meta_info_prop.py
index 2b4a8749c..ab203dfd7 100644
--- a/colossalai/fx/passes/meta_info_prop.py
+++ b/colossalai/fx/passes/meta_info_prop.py
@@ -31,7 +31,7 @@ class TensorMetadata(NamedTuple):
     numel: int
     is_tensor: bool
     # TODO: we can add a list of sharding spec here, and record the sharding
-    # behaviour by appending sharding spec into list.
+    # behavior by appending sharding spec into list.
 
 
 def _extract_tensor_metadata(result: torch.Tensor) -> TensorMetadata:
diff --git a/colossalai/fx/passes/passes_for_gpt2_test.py b/colossalai/fx/passes/passes_for_gpt2_test.py
index abc1a089e..efdd34a01 100644
--- a/colossalai/fx/passes/passes_for_gpt2_test.py
+++ b/colossalai/fx/passes/passes_for_gpt2_test.py
@@ -230,7 +230,7 @@ def split_module_for_gpt2_test(
         use_partition.partitions_dependent_on.setdefault(def_partition_name)
 
     node_process_list = list(m.graph.nodes)
-    # split nodes into parititons
+    # split nodes into partitions
     while node_process_list:
         node = node_process_list.pop(0)
         orig_nodes[node.name] = node
@@ -277,7 +277,7 @@ def split_module_for_gpt2_test(
     if len(sorted_partitions) != len(partitions):
         raise RuntimeError("cycle exists between partitions!")
 
-    # add placeholders to parititons
+    # add placeholders to partitions
     for partition_name in sorted_partitions:
         partition = partitions[partition_name]
         for input in partition.inputs:
diff --git a/colossalai/fx/passes/split_module.py b/colossalai/fx/passes/split_module.py
index 5ce5b969c..61ed037ab 100644
--- a/colossalai/fx/passes/split_module.py
+++ b/colossalai/fx/passes/split_module.py
@@ -29,8 +29,8 @@ class Partition:
             f" nodes: {self.node_names},\n" \
             f" inputs: {self.inputs},\n" \
             f" outputs: {self.outputs},\n" \
-            f" partitions depenent on: {self.partitions_dependent_on},\n" \
-            f" parition dependents: {self.partition_dependents}"
+            f" partitions dependent on: {self.partitions_dependent_on},\n" \
+            f" partition dependents: {self.partition_dependents}"
 
 
 # Creates subgraphs out of main graph