[fx] add balanced policy v2 (#1251)

* [CLI] add CLI launcher * Revert "[CLI] add CLI launcher" This reverts commit df7e6506d4. * [fx] add balanced policy v2 * add unittest
2025-09-01 17:17:05 +00:00 · 2022-07-15 14:54:26 +08:00
parent ca2d3f284f
commit e8acf55e8b
3 changed files with 54 additions and 3 deletions
--- a/colossalai/fx/passes/adding_split_node_pass.py
+++ b/colossalai/fx/passes/adding_split_node_pass.py
@@ -10,7 +10,9 @@ def pipe_split():


 def balanced_split_pass(gm: torch.fx.GraphModule, pp_size: int):
-    # TODO(lyl): balanced policy V2, split module by node size(weight+bias+output)
+    """
+    In balanced_split_pass, we split module by the size of parameters(weights+bias).
+    """
    mod_graph = gm.graph
    total_param_amount = 0
    for param in mod_graph.owning_module.parameters():
@@ -39,6 +41,36 @@ def balanced_split_pass(gm: torch.fx.GraphModule, pp_size: int):
    return gm


+def balanced_split_pass_v2(gm: torch.fx.GraphModule, pp_size: int):
+    """
+    In balanced_split_pass_v2, we split the module by the size of nodes (weights+bias+outputs).
+    """
+    mod_graph = gm.graph
+    # To use balanced_split_pass_v2, we need to run the meta_info_prop interpreter first.
+    # If nodes don't have meta info, this pass will fall back to the normal balanced split pass.
+    check_node = list(mod_graph.nodes)[0]
+    if 'tensor_meta' not in check_node.meta:
+        return balanced_split_pass(gm, pp_size)
+
+    total_element_size = 0
+    for node in mod_graph.nodes:
+        total_element_size += node.node_size
+
+    partition_size = total_element_size // pp_size
+    accumulate_node_size = 0
+    for node in mod_graph.nodes:
+        if pp_size <= 1:
+            break
+        accumulate_node_size += node.node_size
+        if accumulate_node_size >= partition_size:
+            accumulate_node_size = 0
+            pp_size -= 1
+            with mod_graph.inserting_after(node):
+                split_node = mod_graph.create_node('call_function', pipe_split)
+    gm.recompile()
+    return gm
+
+
 def uniform_split_pass(gm: torch.fx.GraphModule, pp_size: int):
    mod_graph = gm.graph
    valid_children_size = 0