[autoparallel] where_handler_v2 (#1688)

* where generator * [autoparallel] where_handler_v2
2025-09-16 14:41:53 +00:00 · 2022-10-13 11:02:22 +08:00
parent 31d2f03d27
commit 319d654f79
5 changed files with 274 additions and 2 deletions
--- a/colossalai/auto_parallel/solver/op_handler/where_handler_v2.py
+++ b/colossalai/auto_parallel/solver/op_handler/where_handler_v2.py
@@ -0,0 +1,87 @@
+import torch
+from .node_handler import NodeHandler
+from ..sharding_strategy import ShardingStrategy_V2, OperationDataType, OperationData, StrategiesVector
+from ..strategy import WhereGenerator, StrategyGenerator_V2
+from .broadcast import recover_sharding_spec_for_broadcast_shape
+from typing import List, Dict
+from .registry import operator_registry
+import operator
+import copy
+
+__all__ = ['WhereHandler']
+
+
+@operator_registry.register(torch.where)
+class WhereHandler(NodeHandler):
+    """
+    A WhereHandler which deals with the sharding strategies for torch.where.
+    """
+
+    def get_strategy_generator(self) -> List[StrategyGenerator_V2]:
+        logical_op_data_mapping, _ = self.get_operation_data_mapping()
+        generators = []
+        generators.append(WhereGenerator(logical_op_data_mapping, self.device_mesh))
+        return generators
+
+    def get_operation_data_mapping(self) -> Dict[str, OperationData]:
+        # use transposed shape for strategies
+        # the strategies will be transformed back to its original shape in self.post_process
+        physical_condition_operand = OperationData(name=str(self.node.args[0]),
+                                                   type=OperationDataType.ARG,
+                                                   data=self.node.args[0]._meta_data)
+        physical_x_operand = OperationData(name=str(self.node.args[1]),
+                                           type=OperationDataType.ARG,
+                                           data=self.node.args[1]._meta_data)
+        physical_y_operand = OperationData(name=str(self.node.args[2]),
+                                           type=OperationDataType.ARG,
+                                           data=self.node.args[2]._meta_data)
+        physical_output = OperationData(name=str(self.node), type=OperationDataType.OUTPUT, data=self.node._meta_data)
+        physical_mapping = {
+            "condition": physical_condition_operand,
+            "x": physical_x_operand,
+            "y": physical_y_operand,
+            "output": physical_output
+        }
+        logical_shape_for_all = self.node._meta_data.shape
+        logical_mapping = {}
+        for key, physical_operand in physical_mapping.items():
+            logical_mapping[key] = self.convert_physical_operand_to_logical_operand(physical_operand,
+                                                                                    logical_shape_for_all)
+
+        return logical_mapping, physical_mapping
+
+    def convert_physical_operand_to_logical_operand(self, physical_operand, target_shape):
+        logical_operand = copy.deepcopy(physical_operand)
+        logical_operand.logical_shape = target_shape
+        return logical_operand
+
+    def register_strategy(self, compute_resharding_cost: bool = False) -> StrategiesVector:
+        """
+        Register different sharding strategies for the current node.
+        """
+        strategy_generators = self.get_strategy_generator()
+
+        for generator in strategy_generators:
+            strategies = generator.generate()
+            strategies_vector = map(self.post_process, strategies)
+            # compute the resharding costs based on the previous node
+            # strategies if specified
+            if compute_resharding_cost:
+                strategies = list(map(self.update_resharding_cost, strategies))
+            self.strategies_vector.extend(strategies)
+
+        self.strategies_vector = list(strategies_vector)
+        return self.strategies_vector
+
+    def post_process(self, strategy: ShardingStrategy_V2):
+        logical_op_data_mapping, physical_op_data_mapping = self.get_operation_data_mapping()
+        for key in logical_op_data_mapping.keys():
+            logical_sharding_spec = strategy.sharding_specs[logical_op_data_mapping[key]]
+            logical_shape = logical_op_data_mapping[key].logical_shape
+            physical_shape = physical_op_data_mapping[key].logical_shape
+            physical_sharding_spec = recover_sharding_spec_for_broadcast_shape(logical_sharding_spec, logical_shape,
+                                                                               physical_shape)
+            strategy.sharding_specs.pop(logical_op_data_mapping[key])
+            strategy.sharding_specs[physical_op_data_mapping[key]] = physical_sharding_spec
+        strategy.name = f"{strategy.sharding_specs[physical_op_data_mapping['output']].sharding_sequence} = {strategy.sharding_specs[physical_op_data_mapping['condition']].sharding_sequence} x {strategy.sharding_specs[physical_op_data_mapping['x']].sharding_sequence} x {strategy.sharding_specs[physical_op_data_mapping['y']].sharding_sequence}"
+        return strategy
--- a/colossalai/auto_parallel/solver/strategy/init.py
+++ b/colossalai/auto_parallel/solver/strategy/init.py
@@ -5,11 +5,12 @@ from .batch_norm_generator import BatchNormStrategyGenerator
 from .unary_elementwise_generator import UnaryElementwiseGenerator
 from .getitem_generator import GetItemStrategyGenerator, TensorStrategyGenerator, TensorTupleStrategyGenerator
 from .layer_norm_generator import LayerNormGenerator
+from .where_generator import WhereGenerator
 from .reshape_generator import ReshapeGenerator

 __all__ = [
    'StrategyGenerator_V2', 'DotProductStrategyGenerator', 'MatVecStrategyGenerator',
    'LinearProjectionStrategyGenerator', 'BatchedMatMulStrategyGenerator', 'ConvStrategyGenerator',
    'UnaryElementwiseGenerator', 'BatchNormStrategyGenerator', 'GetItemStrategyGenerator', 'TensorStrategyGenerator',
-    'TensorTupleStrategyGenerator', 'LayerNormGenerator', 'ReshapeGenerator'
+    'TensorTupleStrategyGenerator', 'LayerNormGenerator', "WhereGenerator", 'ReshapeGenerator'
 ]
--- a/colossalai/auto_parallel/solver/strategy/layer_norm_generator.py
+++ b/colossalai/auto_parallel/solver/strategy/layer_norm_generator.py
@@ -163,7 +163,7 @@ class LayerNormGenerator(StrategyGenerator_V2):

    def generate(self):
        '''
-        Generate every possible strategies for a BatchNorm node, and record all strategies into the strategies_vector.
+        Generate every possible strategies for a LayerNorm node, and record all strategies into the strategies_vector.
        '''
        strategy_list = []
        input_data_dim = len(self.op_data["input"].logical_shape)
--- a/colossalai/auto_parallel/solver/strategy/where_generator.py
+++ b/colossalai/auto_parallel/solver/strategy/where_generator.py
@@ -0,0 +1,99 @@
+import operator
+from functools import reduce
+from ..sharding_strategy import ShardingStrategy_V2, TrainCycleItem, MemoryCost
+from colossalai.tensor.shape_consistency import CollectiveCommPattern
+from .strategy_generator import StrategyGenerator_V2, FollowingStrategyGenerator
+from typing import List
+from .._utils import exception_handler, enumerate_all_possible_1d_sharding, enumerate_all_possible_2d_sharding
+import copy
+
+__all__ = ['WhereGenerator']
+
+
+class WhereGenerator(StrategyGenerator_V2):
+    """
+    WhereGenerator is a generic class to generate strategies for Where operation.
+    """
+
+    def validate(self) -> bool:
+        return super().validate()
+
+    def update_compute_cost(self, strategy: ShardingStrategy_V2):
+        compute_cost = TrainCycleItem(fwd=10, bwd=10, total=20)
+        strategy.compute_cost = compute_cost
+
+    def update_memory_cost(self, strategy: ShardingStrategy_V2):
+        '''
+        Compute the memory cost per device with this specific strategy.
+        '''
+        forward_size_mapping = {
+            'condition': self._compute_size_in_bytes(strategy, "condition"),
+            'x': self._compute_size_in_bytes(strategy, "x"),
+            'y': self._compute_size_in_bytes(strategy, "y"),
+            'output': self._compute_size_in_bytes(strategy, "output")
+        }
+
+        backward_size_mapping = copy.deepcopy(forward_size_mapping)
+        backward_size_mapping.pop("output")
+        # compute fwd cost incurred
+        # fwd_cost = condition + x + y + output
+        fwd_activation_cost = sum([v for k, v in forward_size_mapping.items()])
+        fwd_mem_cost = MemoryCost(activation=fwd_activation_cost, parameter=0)
+
+        # compute bwd cost incurred
+        # bwd_cost = condition_grad + x_grad + y_grad
+        bwd_activation_cost = sum([v for k, v in backward_size_mapping.items()])
+        bwd_mem_cost = MemoryCost(activation=bwd_activation_cost, parameter=0)
+
+        # compute total cost
+        total_mem_cost = MemoryCost(activation=fwd_activation_cost + bwd_activation_cost, parameter=0)
+        memory_cost = TrainCycleItem(fwd=fwd_mem_cost, bwd=bwd_mem_cost, total=total_mem_cost)
+        strategy.memory_cost = memory_cost
+
+    def _generate_strategy_with_dim_partition(self, dim_partition):
+        dim_partition_dict_mapping = {
+            "condition": dim_partition,
+            "x": dim_partition,
+            "y": dim_partition,
+            "output": dim_partition
+        }
+
+        sharding_spec_mapping = self.to_sharding_spec_mapping(dim_partition_dict_mapping)
+
+        name = f'{sharding_spec_mapping["output"].sharding_sequence} = {sharding_spec_mapping["condition"].sharding_sequence} x {sharding_spec_mapping["x"].sharding_sequence} x {sharding_spec_mapping["y"].sharding_sequence}'
+        communication_action_mapping = {}
+
+        strategy = self.get_sharding_strategy(name=name,
+                                              sharding_spec_mapping=sharding_spec_mapping,
+                                              communication_action_mapping=communication_action_mapping)
+
+        return strategy
+
+    def enumerate_all_possible_output_spec(self, mesh_dim_0, mesh_dim_1, dimension_length):
+        dim_partition_list = []
+        dim_partition_list.extend(enumerate_all_possible_1d_sharding(mesh_dim_0, dimension_length))
+        dim_partition_list.extend(enumerate_all_possible_1d_sharding(mesh_dim_1, dimension_length))
+        dim_partition_list.extend(enumerate_all_possible_2d_sharding(mesh_dim_0, mesh_dim_1, dimension_length))
+        # append {} for non_split case
+        dim_partition_list.append({})
+
+        return dim_partition_list
+
+    def generate(self):
+        '''
+        Generate every possible strategies for a where node, and record all strategies into the strategies_vector.
+        '''
+        strategy_list = []
+
+        dimension_length = len(self.op_data["output"].logical_shape)
+        dim_partition_list = self.enumerate_all_possible_output_spec(0, 1, dimension_length)
+        for dim_partition in dim_partition_list:
+            strategy = self._generate_strategy_with_dim_partition(dim_partition)
+            strategy_list.append(strategy)
+
+        for strategy in strategy_list:
+            self.update_communication_cost(strategy)
+            self.update_compute_cost(strategy)
+            self.update_memory_cost(strategy)
+
+        return strategy_list