[autoparallel] resnet block runtime apply (#1709)

* [autoparallel] resnet block runtime apply * separate buffer and parameter in MemoryCost * polish code * add comments and todos * fix test issue
2026-05-05 12:24:38 +00:00 · 2022-10-17 13:37:38 +08:00
parent b0a23dc4fc
commit 845ff4a47a
11 changed files with 277 additions and 27 deletions
--- a/colossalai/auto_parallel/tensor_shard/node_handler/batch_norm_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/batch_norm_handler.py
@@ -36,7 +36,30 @@ class BatchNormModuleHandler(ModuleHandler):
                                               logical_shape=self.named_parameters['weight'].shape)
        physical_output = OperationData(name=str(self.node), type=OperationDataType.OUTPUT, data=self.node._meta_data)

-        mapping = {"input": physical_input_operand, "other": physical_other_operand, "output": physical_output}
+        physical_running_mean_operand = OperationData(name="running_mean",
+                                                      type=OperationDataType.BUFFER,
+                                                      data=self.named_buffers['running_mean'],
+                                                      logical_shape=self.named_buffers['running_mean'].shape)
+
+        physical_running_var_operand = OperationData(name="running_var",
+                                                     type=OperationDataType.BUFFER,
+                                                     data=self.named_buffers['running_var'],
+                                                     logical_shape=self.named_buffers['running_var'].shape)
+
+        physical_num_batches_tracked_operand = OperationData(
+            name="num_batches_tracked",
+            type=OperationDataType.BUFFER,
+            data=self.named_buffers['num_batches_tracked'],
+            logical_shape=self.named_buffers['num_batches_tracked'].shape)
+
+        mapping = {
+            "input": physical_input_operand,
+            "other": physical_other_operand,
+            "output": physical_output,
+            "running_mean": physical_running_mean_operand,
+            "running_var": physical_running_var_operand,
+            "num_batches_tracked": physical_num_batches_tracked_operand
+        }

        if self.named_parameters['bias'] is not None:
            physical_bias_operand = OperationData(name="bias",
--- a/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/node_handler.py
@@ -146,7 +146,10 @@ class ModuleHandler(NodeHandler):
            f'The graph is not associated with a module, please make sure it can be used to instantiate a GraphModule object.'
        module = self.node.graph.owning_module.get_submodule(self.node.target)
        named_parameters = list(module.named_parameters(recurse=False))
+        named_buffers = list(module.named_buffers(recurse=False))
        # convert named parameters from list to dict
        named_parameters = {k: v for k, v in named_parameters}
+        named_buffers = {k: v for k, v in named_buffers}
        self.module = module
        self.named_parameters = named_parameters
+        self.named_buffers = named_buffers
--- a/colossalai/auto_parallel/tensor_shard/node_handler/reshape_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/reshape_handler.py
@@ -13,6 +13,7 @@ __all__ = ['ReshapeHandler']
@operator_registry.register(torch.reshape)
@operator_registry.register(torch.flatten)
@operator_registry.register(torch.Tensor.permute)
+@operator_registry.register(torch.nn.AdaptiveAvgPool2d)
 class ReshapeHandler(NodeHandler):
    """
    A ReshapeHandler which deals with the sharding strategies for Reshape Op, such as torch.reshape.
--- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/batch_norm_generator.py
@@ -64,7 +64,9 @@ class BatchNormStrategyGenerator(StrategyGenerator):
        forward_size_mapping = {
            'input': self._compute_size_in_bytes(strategy, "input"),
            'other': self._compute_size_in_bytes(strategy, "other"),
-            'output': self._compute_size_in_bytes(strategy, "output")
+            'output': self._compute_size_in_bytes(strategy, "output"),
+            'running_mean': self._compute_size_in_bytes(strategy, "running_mean"),
+            'running_var': self._compute_size_in_bytes(strategy, "running_var"),
        }

        if self.has_bias:
@@ -75,24 +77,27 @@ class BatchNormStrategyGenerator(StrategyGenerator):
        backward_size_mapping.pop("output")
        # compute fwd cost incurred
        # fwd_cost = input + other + bias + output
-        fwd_activation_cost = sum([v for k, v in forward_size_mapping.items() if not self.is_param(k)])
+        fwd_activation_cost = sum(
+            [v for k, v in forward_size_mapping.items() if not self.is_param(k) and not self.is_buffer(k)])
        fwd_parameter_cost = sum([v for k, v in forward_size_mapping.items() if self.is_param(k)])
-        fwd_mem_cost = MemoryCost(activation=fwd_activation_cost, parameter=fwd_parameter_cost)
+        fwd_buffer_cost = sum([v for k, v in forward_size_mapping.items() if self.is_buffer(k)])
+        fwd_mem_cost = MemoryCost(activation=fwd_activation_cost, parameter=fwd_parameter_cost, buffer=fwd_buffer_cost)

        # compute bwd cost incurred
        # bwd_cost = input_grad + other_grad + bias_grad
-        bwd_activation_cost = sum([v for k, v in backward_size_mapping.items() if not self.is_param(k)])
+        bwd_activation_cost = sum(
+            [v for k, v in backward_size_mapping.items() if not self.is_param(k) and not self.is_buffer(k)])
        bwd_parameter_cost = sum([v for k, v in backward_size_mapping.items() if self.is_param(k)])
        bwd_mem_cost = MemoryCost(activation=bwd_activation_cost, parameter=bwd_parameter_cost)

        # compute total cost
        total_mem_cost = MemoryCost(activation=fwd_activation_cost + bwd_activation_cost,
-                                    parameter=fwd_parameter_cost + bwd_parameter_cost)
+                                    parameter=fwd_parameter_cost + bwd_parameter_cost,
+                                    buffer=fwd_buffer_cost)
        memory_cost = TrainCycleItem(fwd=fwd_mem_cost, bwd=bwd_mem_cost, total=total_mem_cost)
        strategy.memory_cost = memory_cost

    def split_input_channel(self, mesh_dim_0):
-        strategy_list = []
        name = f'RS{mesh_dim_0} = RS{mesh_dim_0} x S{mesh_dim_0}'
        dim_partition_dict_mapping = {
            "input": {
@@ -104,6 +109,13 @@ class BatchNormStrategyGenerator(StrategyGenerator):
            "output": {
                1: [mesh_dim_0]
            },
+            "running_mean": {
+                0: [mesh_dim_0]
+            },
+            "running_var": {
+                0: [mesh_dim_0]
+            },
+            "num_batches_tracked": {},
        }
        if self.has_bias:
            dim_partition_dict_mapping["bias"] = {0: [mesh_dim_0]}
@@ -128,6 +140,13 @@ class BatchNormStrategyGenerator(StrategyGenerator):
            "output": {
                1: [mesh_dim_0, mesh_dim_1]
            },
+            "running_mean": {
+                0: [mesh_dim_0, mesh_dim_1]
+            },
+            "running_var": {
+                0: [mesh_dim_0, mesh_dim_1]
+            },
+            "num_batches_tracked": {},
        }
        if self.has_bias:
            dim_partition_dict_mapping["bias"] = {0: [mesh_dim_0, mesh_dim_1]}
@@ -146,6 +165,9 @@ class BatchNormStrategyGenerator(StrategyGenerator):
            "input": {},
            "other": {},
            "output": {},
+            "running_mean": {},
+            "running_var": {},
+            "num_batches_tracked": {},
        }
        if self.has_bias:
            dim_partition_dict_mapping["bias"] = {}
@@ -168,6 +190,9 @@ class BatchNormStrategyGenerator(StrategyGenerator):
            "output": {
                0: [mesh_dim_0]
            },
+            "running_mean": {},
+            "running_var": {},
+            "num_batches_tracked": {},
        }
        if self.has_bias:
            dim_partition_dict_mapping["bias"] = {}
@@ -199,6 +224,9 @@ class BatchNormStrategyGenerator(StrategyGenerator):
            "output": {
                0: [mesh_dim_0, mesh_dim_1]
            },
+            "running_mean": {},
+            "running_var": {},
+            "num_batches_tracked": {},
        }
        if self.has_bias:
            dim_partition_dict_mapping["bias"] = {}
@@ -234,6 +262,13 @@ class BatchNormStrategyGenerator(StrategyGenerator):
                0: [mesh_dim_0],
                1: [mesh_dim_1],
            },
+            "running_mean": {
+                0: [mesh_dim_1],
+            },
+            "running_var": {
+                0: [mesh_dim_1],
+            },
+            "num_batches_tracked": {},
        }
        if self.has_bias:
            dim_partition_dict_mapping["bias"] = {
@@ -273,16 +308,22 @@ class BatchNormStrategyGenerator(StrategyGenerator):
        # RS01 = RS01 x S01
        strategy_list.append(self.split_input_channel_1d(0, 1))

+        # The strategies with SYNC_BN are temporarily commented out,
+        # because they require some additional passes to keep the runtime
+        # computation correct.
+
+        # TODO: The strategies below should be uncommented once the runtime
+        # passes are ready.
        # SR = SR x R WITH SYNC_BN
-        strategy_list.append(self.split_input_batch(0))
-        strategy_list.append(self.split_input_batch(1))
+        # strategy_list.append(self.split_input_batch(0))
+        # strategy_list.append(self.split_input_batch(1))

        # SS = SS x S WITH SYNC_BN
-        strategy_list.append(self.split_input_both_dim(0, 1))
-        strategy_list.append(self.split_input_both_dim(1, 0))
+        # strategy_list.append(self.split_input_both_dim(0, 1))
+        # strategy_list.append(self.split_input_both_dim(1, 0))

        # S01R = S01R x R WITH SYNC_BN
-        strategy_list.append(self.split_input_batch_1d(0, 1))
+        # strategy_list.append(self.split_input_batch_1d(0, 1))

        for strategy in strategy_list:
            self.update_communication_cost(strategy)
--- a/colossalai/auto_parallel/tensor_shard/node_handler/strategy/strategy_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/strategy/strategy_generator.py
@@ -35,6 +35,10 @@ class StrategyGenerator(ABC):
        other_data = self.op_data[op_data_name]
        return other_data.type == OperationDataType.PARAM

+    def is_buffer(self, op_data_name):
+        other_data = self.op_data[op_data_name]
+        return other_data.type == OperationDataType.BUFFER
+
    def get_sharding_strategy(self, name: str, sharding_spec_mapping: Dict[str, ShardingSpec],
                              communication_action_mapping: Dict[str, CommSpec]):
        """
--- a/colossalai/auto_parallel/tensor_shard/sharding_strategy.py
+++ b/colossalai/auto_parallel/tensor_shard/sharding_strategy.py
@@ -20,7 +20,8 @@ class OperationDataType(Enum):
    INPUT = 0
    ARG = 1
    PARAM = 2
-    OUTPUT = 3
+    BUFFER = 3
+    OUTPUT = 4


@dataclass
@@ -80,6 +81,7 @@ class MemoryCost:
    """
    activation: int = 0
    parameter: int = 0
+    buffer: int = 0


@dataclass
--- a/colossalai/auto_parallel/tensor_shard/solver/cost_graph.py
+++ b/colossalai/auto_parallel/tensor_shard/solver/cost_graph.py
@@ -1,4 +1,5 @@
 from colossalai.auto_parallel.tensor_shard.constants import INFINITY_COST
+import torch


 class CostGraph:
@@ -51,7 +52,6 @@ class CostGraph:
                if src_node not in self.nodes:
                    continue
                node_pair = (src_node, dst_node)
-                # src_index = strategies_vector.predecessor_nodes.index(src_node)
                edge_cost = {}
                for i in range(len(strategies_vector)):
                    for j in range(len(src_node.strategies_vector)):
@@ -62,10 +62,12 @@ class CostGraph:
                            edge_cost[(j, i)] = resharding_cost_item.total
                self.edge_costs[node_pair] = edge_cost
            # add parents and children attribute to node
-            setattr(dst_node, 'parents', strategies_vector.predecessor_nodes)
-            setattr(dst_node, 'children', strategies_vector.successor_nodes)
-            self._remove_invalid_node(dst_node, 'parents')
-            self._remove_invalid_node(dst_node, 'children')
+            parent_nodes = [node for node in strategies_vector.predecessor_nodes]
+            children_nodes = [node for node in strategies_vector.successor_nodes]
+            setattr(dst_node, 'parents', parent_nodes)
+            setattr(dst_node, 'children', children_nodes)
+            # self._remove_invalid_node(dst_node, 'parents')
+            # self._remove_invalid_node(dst_node, 'children')

            if self.simplify and strategies_vector.check_merge():
                for followed_node in strategies_vector.predecessor_nodes:
--- a/colossalai/auto_parallel/tensor_shard/solver/solver.py
+++ b/colossalai/auto_parallel/tensor_shard/solver/solver.py
@@ -169,10 +169,7 @@ class Solver:
                else:
                    communication_costs.append(origin_communication_cost)
                memory_costs.append(memory_cost)
-                # if isinstance(memory_cost, tuple):
-                #     memory_costs.append(memory_cost[0])
-                # else:
-                #     memory_costs.append(memory_cost)
+
        compute_costs = np.array(compute_costs)
        communication_costs = np.array(communication_costs)
        memory_costs = np.array(memory_costs)
--- a/colossalai/fx/passes/experimental/adding_shape_consistency_pass_v2.py
+++ b/colossalai/fx/passes/experimental/adding_shape_consistency_pass_v2.py
@@ -36,16 +36,19 @@ def solution_annotatation_pass(gm: torch.fx.GraphModule, solution: List[int], de
            for name, param in target_module.named_parameters():
                origin_sharding_spec = ShardingSpec(device_mesh, param.shape, {})
                setattr(param, 'sharding_spec', origin_sharding_spec)
-                target_weight_sharding_spec = node.best_strategy.get_sharding_spec_by_name(name)
-                apply(param, target_weight_sharding_spec)
+                target_sharding_spec = node.best_strategy.get_sharding_spec_by_name(name)
+                apply(param, target_sharding_spec)
+
+            for name, buffer in target_module.named_buffers():
+                origin_sharding_spec = ShardingSpec(device_mesh, buffer.shape, {})
+                setattr(buffer, 'sharding_spec', origin_sharding_spec)
+                target_sharding_spec = node.best_strategy.get_sharding_spec_by_name(name)
+                apply(buffer, target_sharding_spec)

    # the dict to get input sharding specs of user node
    sharding_spec_convert_dict = {}
    for index, node in enumerate(nodes):
        target_sharding_specs = []
-        if node.name == 'bn1':
-            print(node.strategies_vector.successor_nodes)
-            assert False
        for user_node in node.strategies_vector.successor_nodes:
            # node_index = user_node.strategies_vector.predecessor_nodes.index(node)
            # target_sharding_spec = user_node.best_strategy.input_shardings[node_index]