[fx] fix the false interpretation of algorithm 3 in https://arxiv.org/abs/1604.06174. (#1446)

* [fx] modify the calculation of node_size in MetaInfoProp for activation checkpointing usages
* [fx] modify the calculation of node_size in MetaInfoProp for activation checkpointing usages
* [fx] modify the calculation of node_size in MetaInfoProp for activation checkpointing usages
* [fx] activation checkpointing using Chen strategies.
* [fx] add test for ckpt_solver_chen
* mend
* [fx] add vanilla activation checkpoint search with test on resnet and densenet
* [fx] add vanilla activation checkpoint search with test on resnet and densenet
* [fx] add a namespace code for solver_chen.
* [fx] fix the false interpretation of algorithm 3 in https://arxiv.org/abs/1604.06174.
* [fx] fix lowercase naming conventions.
2025-09-05 11:02:05 +00:00 · 2022-08-12 11:28:50 +08:00
parent 821c6172e2
commit d40a9392ba
2 changed files with 62 additions and 22 deletions
--- a/colossalai/fx/passes/algorithms/ckpt_solver_chen.py
+++ b/colossalai/fx/passes/algorithms/ckpt_solver_chen.py
@@ -1,45 +1,71 @@
+from typing import Set, Tuple
 import torch
 from torch.fx import GraphModule
+import math

 __all__ = ['chen_greedy', 'chen_sqrtn']


-def chen_greedy(gm: GraphModule, B: int):
+def chen_greedy(gm: GraphModule) -> GraphModule:
    """
    This is the simple implementation of Algorithm 3 in https://arxiv.org/abs/1604.06174.
+    Note that this algorithm targets memory optimization only, using the techniques in Appendix A.

    Usage:
-        B = 5 * 1024 * 1024 * 1024  # An approximate memory budget of 5GB
        model = resnet18()
        input_sample = torch.rand(4, 3, 224, 224)
        gm = symbolic_trace(model)
        MetaInfoProp(gm).run(input_sample)
-        gm = chen_greedy(gm, B)
+        gm = chen_greedy(gm)

    Args:
        gm (GraphModule): The module to add checkpoints
-        B (int): The approximate memory budget for this module.
    """
+
+    def grid_search(num_grids: int = 6) -> Set:
+        """
+        Search ckpt strategy with b = 0, then run the allocation algorithm again with b = √xy.
+        Grid search over [√2/2 b, √2 b] for ckpt_opt over num_grids as in appendix A.
+        """
+        _, b_approx = run_chen_greedy(0)
+        b_min, b_max = math.floor(b_approx / math.sqrt(2)), math.ceil(b_approx * math.sqrt(2))
+        b_opt = math.inf
+        for b in range(b_min, b_max, (b_max - b_min) // num_grids):
+            ckpt, b_approx = run_chen_greedy(b)
+            if b_approx < b_opt:
+                b_opt = b_approx
+                ckpt_opt = ckpt
+        return ckpt_opt
+
+    def run_chen_greedy(b: int = 0) -> Tuple[Set, int]:
+        """
+        This is the simple implementation of Algorithm 3 in https://arxiv.org/abs/1604.06174.
+        """
+        ckpt = set()
+        temp = 0
+        x = 0
+        y = 0
+        for (idx, n) in enumerate(gm.graph.nodes):
+            temp += getattr(n, 'activation_size')
+            y = max(y, temp)
+            if temp > b:
+                x += getattr(n, 'activation_size')
+                temp = 0
+                ckpt.add(idx)
+        return ckpt, math.floor(math.sqrt(x * y))
+
    gm.graph.lint()    # make sure nodes are in topological order
-    temp = 0
-    x = 0
-    idx = 0
-    budget = B
-    for n in gm.graph.nodes:
-        B -= getattr(n, 'param_size')
-        assert B > 0, f'The memory budget {budget / 1024 ** 3:.2f} GB is not enough for model parameters of {gm}'
-    for n in gm.graph.nodes:
-        temp += getattr(n, 'activation_size')
-        if temp > B:
-            x += getattr(n, 'activation_size')
-            temp = x
-            setattr(n, 'activation_checkpoint', str(idx))
-            idx += 1
+    ckpt = grid_search(num_grids=6)
+    i = 0
+    for idx, n in enumerate(gm.graph.nodes):
+        if idx in ckpt:
+            setattr(n, 'activation_checkpoint', str(i))
+            i += 1
    gm.recompile()
    return gm


-def chen_sqrtn(gm: GraphModule):
+def chen_sqrtn(gm: GraphModule) -> GraphModule:
    """
    This is the theoretical optimal strategy in https://arxiv.org/abs/1604.06174.