[autoparallel] fix forward memory calculation (#2062)

2025-09-13 05:01:44 +00:00 · 2022-12-04 15:00:16 +08:00
parent 44ea461890
commit 4b40fbd743
5 changed files with 29 additions and 24 deletions
--- a/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py
@@ -106,15 +106,15 @@ def linear_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, L
        # calculate memory cost
        # NOTE: Linear don't have buffer and temp in forward and backward phase
        # the forward activation cost is the size of output_tensor, parameter cost is the size of weight_tensor and bias_tensor
-        fwd_memory_cost = MemoryCost(activation=activation_size(output_tensor),
-                                     parameter=activation_size(weight_tensor) + activation_size(bias_tensor),
+        # NOTE: the SPMD solver currently assumes that a new tensor is always created in forward
+        fwd_memory_cost = MemoryCost(activation=activation_size([input_tensor, output_tensor]),
+                                     parameter=activation_size([weight_tensor, bias_tensor]),
                                     temp=0,
                                     buffer=0)

        # the backward activation cost is the size of input_tensor, weight_tensor and bias_tensor, parameter cost is 0
-        bwd_memory_cost = MemoryCost(activation=activation_size(input_tensor) + activation_size(weight_tensor) +
-                                     activation_size(bias_tensor),
-                                     parameter=activation_size(weight_tensor) + activation_size(bias_tensor),
+        bwd_memory_cost = MemoryCost(activation=activation_size([input_tensor, weight_tensor, bias_tensor]),
+                                     parameter=activation_size([weight_tensor, bias_tensor]),
                                     temp=0,
                                     buffer=0)

@@ -142,13 +142,14 @@ def linear_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, L
        # calculate memory cost
        # NOTE: Linear don't have buffer and temp in forward and backward phase
        # the forward activation cost is the size of output_tensor, parameter cost is the size of weight_tensor
+        # NOTE: the SPMD solver currently assumes that a new tensor is always created in forward
        fwd_memory_cost = MemoryCost(activation=activation_size(output_tensor),
                                     parameter=activation_size(weight_tensor),
                                     temp=0,
                                     buffer=0)

        # the backward activation cost is the size of input_tensor and weight_tensor, parameter cost is 0
-        bwd_memory_cost = MemoryCost(activation=activation_size(input_tensor) + activation_size(weight_tensor),
+        bwd_memory_cost = MemoryCost(activation=activation_size([input_tensor, weight_tensor]),
                                     parameter=activation_size(weight_tensor),
                                     temp=0,
                                     buffer=0)