diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py b/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py
index ab391ebfa..d3d09a9dc 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py
@@ -75,7 +75,7 @@ class NodeHandler(ABC):
                 prev_strategy.get_sharding_spec_by_name(node_name) for prev_strategy in prev_strategy_vector
             ]
-            # create data structrure to store costs
+            # create data structure to store costs
             if node not in resharding_costs:
                 resharding_costs[node] = []
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py
index 1f3812429..79b69acb2 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py
@@ -24,7 +24,7 @@ class BatchNormStrategyGenerator(StrategyGenerator):
     To keep the math consistency, there are two way to do BatchNorm if the input shards on batch dimension:
     1. We gather the input partitions through batch dimension, then do the normal BatchNorm.
-    2. We do the SyncBatchNorm on the each input partition seperately, the SyncBN op will help
+    2. We do the SyncBatchNorm on the each input partition separately, the SyncBN op will help
     us to keep the computing correctness.
     In this generator, both methods will be considered.
     """
@@ -212,7 +212,7 @@ class BatchNormStrategyGenerator(StrategyGenerator):
         # set communication action
         # For SyncBN case, we don't need to do communication for weight and bias.
-        # TODO: the communication happens interally at SyncBN operation. We need to replace the BN operation
+        # TODO: the communication happens internally at SyncBN operation. We need to replace the BN operation
         # to SyncBN operation instead of inserting a communication node.
         output_comm_action = self.get_communication_action(
             sharding_spec=sharding_spec_mapping["output"],
@@ -250,7 +250,7 @@ class BatchNormStrategyGenerator(StrategyGenerator):
         # set communication action
         # For SyncBN case, we don't need to do communication for gradients of weight and bias.
-        # TODO: the communication happens interally at SyncBN operation. We need to replace the BN operation
+        # TODO: the communication happens internally at SyncBN operation. We need to replace the BN operation
         # to SyncBN operation instead of inserting a communication node.
         output_comm_action = self.get_communication_action(
             sharding_spec=sharding_spec_mapping["output"],
@@ -298,7 +298,7 @@ class BatchNormStrategyGenerator(StrategyGenerator):
         # set communication action
         # For SyncBN case, we don't need to do communication for gradients of weight and bias.
-        # TODO: the communication happens interally at SyncBN operation. We need to replace the BN operation
+        # TODO: the communication happens internally at SyncBN operation. We need to replace the BN operation
         # to SyncBN operation instead of inserting a communication node.
         output_comm_action = self.get_communication_action(
             sharding_spec=sharding_spec_mapping["output"],
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/binary_elementwise_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/binary_elementwise_generator.py
index fd7f811c8..d27cc046e 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/binary_elementwise_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/binary_elementwise_generator.py
@@ -51,7 +51,7 @@ class BinaryElementwiseStrategyGenerator(StrategyGenerator):
         # compute fwd memory cost in bytes
         # as the elementwise ops are not memory-intensive
-        # we approximate the fwd memroy cost to be the output
+        # we approximate the fwd memory cost to be the output
         # and the backward memory cost to be grad of input and other
         input_bytes = self._compute_size_in_bytes(strategy, 'input')
         other_bytes = self._compute_size_in_bytes(strategy, 'other')
diff --git a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/strategy_generator.py b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/strategy_generator.py
index 6d68521aa..d42429745 100644
--- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/strategy_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/strategy_generator.py
@@ -225,7 +225,7 @@ class StrategyGenerator(ABC):
                 if isinstance(meta_data, torch.Tensor):
                     element_bytes = _compute_size_in_bytes_helper(sharding_spec, meta_data)
                 else:
-                    # if meta_data is not a tensor, we count the memroy as 0
+                    # if meta_data is not a tensor, we count the memory as 0
                     element_bytes = 0
                 total_bytes += element_bytes
@@ -233,7 +233,7 @@ class StrategyGenerator(ABC):
         if isinstance(op_data.data, torch.Tensor):
             total_bytes = _compute_size_in_bytes_helper(strategy.sharding_specs[op_data], op_data.data)
         else:
-            # if op_data.data is not a tensor, we count the memroy as 0
+            # if op_data.data is not a tensor, we count the memory as 0
             total_bytes = 0
         return total_bytes
diff --git a/colossalai/auto_parallel/tensor_shard/solver/cost_graph.py b/colossalai/auto_parallel/tensor_shard/solver/cost_graph.py
index 74290453c..1b2d3ad57 100644
--- a/colossalai/auto_parallel/tensor_shard/solver/cost_graph.py
+++ b/colossalai/auto_parallel/tensor_shard/solver/cost_graph.py
@@ -9,7 +9,7 @@ class CostGraph:
     1. To feed the quadratic resharding costs into solver, we need to linearize it. We build edge_cost in
     CostGraph, and it stored every combinations of strategies for a src-dst node pair in an 1D list.
     2. To reduce the searching space, we merge computationally-trivial operators, such as
-    element-wise operators, transpose, and reduction, into their following nodes. The merging infomation will
+    element-wise operators, transpose, and reduction, into their following nodes. The merging information will
     be given by the StrategiesVector depending on the type of target node and following nodes.
     Argument:
@@ -90,7 +90,7 @@ class CostGraph:
         if self.simplify and strategies_vector.check_merge():
             for followed_node in strategies_vector.predecessor_nodes:
                 # we only merge node pairs which src node has a tensor element inside.
-                # This is necessay because the node without a tensor element inside will not
+                # This is necessary because the node without a tensor element inside will not
                 # be assigned any strategy.
                 if _check_tensor_in_node(followed_node._meta_data):
                     self.merge_pair.append((followed_node, dst_node))
diff --git a/colossalai/auto_parallel/tensor_shard/solver/graph_analysis.py b/colossalai/auto_parallel/tensor_shard/solver/graph_analysis.py
index be39a74cb..171aa8b33 100644
--- a/colossalai/auto_parallel/tensor_shard/solver/graph_analysis.py
+++ b/colossalai/auto_parallel/tensor_shard/solver/graph_analysis.py
@@ -83,7 +83,7 @@ class GraphAnalyser:
     def liveness_analysis(self) -> List[LiveStage]:
         """
-        Analyse the graph to obtain the variable liveness information. This function returns
+        Analyses the graph to obtain the variable liveness information. This function returns
         an ordered dictionary where the key is the compute stage ID and the value is a LivenessStage object.
         """
         compute_nodes = self.graph.nodes
@@ -91,7 +91,7 @@ class GraphAnalyser:
         # checked: record all variables created since the first stage
         # all: record the live variables only exist until the current stage.
-        # this can be different from the `checked list`` as some varialbes may be destroyed prior to this stage.
+        # this can be different from the `checked list`` as some variables may be destroyed prior to this stage.
         # unique: record the unique live variables only exist until the current stage.
         # this is different from `all list` as some variables are duplicated.
         checked_variables = LiveVariableVector()
@@ -103,7 +103,7 @@ class GraphAnalyser:
             # find new living variables #
             #############################
            # detect whether the current op is an in-place op
-            # if it is an in-place op, we would deem it as a duplciate var
+            # if it is an in-place op, we would deem it as a duplicate var
             is_inplace = False
             if node.op == 'call_function':
                 # check if this is an inplace op such as torch.nn.functional.relu(x, inplace=True)
diff --git a/colossalai/auto_parallel/tensor_shard/solver/solver.py b/colossalai/auto_parallel/tensor_shard/solver/solver.py
index f5c6663dc..564c5f092 100644
--- a/colossalai/auto_parallel/tensor_shard/solver/solver.py
+++ b/colossalai/auto_parallel/tensor_shard/solver/solver.py
@@ -44,7 +44,7 @@ class Solver:
            graph: The computing graph to be optimized.
            strategies_constructor: It will provide all the possible strategies for each node in the computing graph.
            cost_graph: A graph data structure to simplify the edge cost graph.
-           graph_analyser: graph_analyser will analyses the graph to obtain the variable liveness information, which will be used to generate memory constraints.
+           graph_analyser: graph_analyser analyses the graph to obtain the variable liveness information, which will be used to generate memory constraints.
            memory_budget: Memory constraint for the solution.
            solution_numbers: If solution_numbers is larger than one, solver will us a serious of solutions based on different memory budget.
            memory_increasing_coefficient: If solution_numbers is larger than one, we will use this coefficient to generate new memory budget.
diff --git a/colossalai/testing/pytest_wrapper.py b/colossalai/testing/pytest_wrapper.py
index a472eb372..b264b0090 100644
--- a/colossalai/testing/pytest_wrapper.py
+++ b/colossalai/testing/pytest_wrapper.py
@@ -33,7 +33,7 @@ def run_on_environment_flag(name: str):
     assert isinstance(name, str)
     flag = os.environ.get(name.upper(), '0')
-    reason = f'Environment varialbe {name} is {flag}'
+    reason = f'Environment variable {name} is {flag}'
     if flag == '1':
         return pytest.mark.skipif(False, reason=reason)
     else: