[autoparallel] add bias addition function class (#2098)

* [autoparallel] add bias addition function class * polish code * polish
2025-09-02 01:28:31 +00:00 · 2022-12-08 11:31:51 +08:00
parent 3af7e65dea
commit b175e6d58e
5 changed files with 216 additions and 33 deletions
--- a/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_function/__init__.py
+++ b/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_function/__init__.py
@@ -0,0 +1,2 @@
+from .addmm import Addmm
+from .bias_addition_function import BiasAdditionFunc, LinearBasedBiasFunc, func_to_func_dict
--- a/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_function/addmm.py
+++ b/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_function/addmm.py
@@ -0,0 +1,76 @@
+import operator
+
+import torch
+import torch.nn.functional as F
+
+from ...registry import bias_addition_function
+from .bias_addition_function import LinearBasedBiasFunc
+
+
+@bias_addition_function.register(torch.addmm)
+class Addmm(LinearBasedBiasFunc):
+    """
+    Bias addition function registered for torch.addmm. It rebuilds the call out of a
+    transpose node, a linear node, optional operator.mul nodes and an operator.add node.
+    """
+
+    def extract_kwargs_from_origin_func(self):
+        """
+        Return a dict holding only those of 'beta' and 'alpha' that were passed as keyword
+        arguments to the original torch.addmm call. A coefficient that was not passed is left
+        out, and generate() then skips the corresponding multiplication node.
+        """
+        return {name: self.kwargs[name] for name in ('beta', 'alpha') if name in self.kwargs}
+
+    def coefficent_for_addmm(self, input_proxy, coefficent):
+        """
+        Create an operator.mul node computing input_proxy * coefficent and return its proxy.
+
+        The formula for torch.addmm is out = beta * input + alpha * (m1 @ m2), so up to two
+        of these nodes are needed in the restructured graph for numerical correctness.
+
+        Args:
+            input_proxy: left operand of the multiplication.
+            coefficent: right operand of the multiplication.
+        """
+        operands = (input_proxy, coefficent)
+        return self.tracer.create_proxy('call_function', operator.mul, operands, {})
+
+    def transpose_other_operand_for_linear(self, other_proxy):
+        """
+        Create a torch.transpose node swapping dim 0 and dim 1 of other_proxy and return its proxy.
+
+        For example:
+            input = torch.rand(3, 4)
+            m1 = torch.rand(3, 5)
+            m2 = torch.rand(5, 4)
+            original_output = torch.addmm(input, m1, m2)
+            # linear(x, w) computes x @ w.T, so m2 is transposed before it is handed to
+            # the linear function in order to reproduce m1 @ m2.
+            new_output = torch.nn.functional.linear(m1, m2.transpose(0, 1)) + input
+        """
+        transpose_args = (other_proxy, 0, 1)
+        return self.tracer.create_proxy('call_function', torch.transpose, transpose_args, {})
+
+    def generate(self):
+        """
+        Build the restructured graph for the torch.addmm call.
+
+        Nodes are created in this order: transpose, linear, the beta multiplication (only if
+        'beta' was passed), the alpha multiplication (only if 'alpha' was passed), add.
+
+        Returns:
+            The proxy of the final operator.add node.
+        """
+        input_proxy, mat1_proxy, mat2_proxy = self.args[0], self.args[1], self.args[2]
+
+        transposed_proxy = self.transpose_other_operand_for_linear(mat2_proxy)
+        linear_proxy = self.create_non_bias_func_proxy(mat1_proxy, transposed_proxy)
+        scale = self.extract_kwargs_from_origin_func()
+
+        # beta * input; without an explicit beta the input is used as is
+        bias_term = self.coefficent_for_addmm(input_proxy, scale['beta']) if 'beta' in scale else input_proxy
+        # alpha * (m1 @ m2); without an explicit alpha the linear result is used as is
+        matmul_term = self.coefficent_for_addmm(scale['alpha'], linear_proxy) if 'alpha' in scale else linear_proxy
+
+        return self.create_bias_addition_proxy(matmul_term, bias_term)
--- a/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_function/bias_addition_function.py
+++ b/colossalai/fx/tracer/bias_addition_patch/patched_bias_addition_function/bias_addition_function.py
@@ -0,0 +1,91 @@
+import operator
+from abc import ABC, abstractmethod
+
+import torch
+import torch.nn.functional as F
+
+
+class BiasAdditionFunc(ABC):
+    """
+    Base class of the helpers that rebuild the computation graph of a call_function
+    node which performs a bias addition internally.
+
+    Args:
+        tracer: tracer whose create_proxy method subclasses use to insert new nodes.
+        target: the traced function, e.g. torch.addmm.
+        args: positional arguments of the traced call.
+        kwargs: keyword arguments of the traced call.
+        substitute_func: function that performs the main computation without the bias.
+    """
+
+    def __init__(self, tracer, target, args, kwargs, substitute_func):
+        self.tracer, self.target = tracer, target
+        self.args, self.kwargs = args, kwargs
+        self.substitute_func = substitute_func
+
+    @abstractmethod
+    def extract_kwargs_from_origin_func(self):
+        """
+        Extract the kwargs of the original call that are needed for the graph transform.
+
+        For example:
+            The formula for torch.addmm is out = beta * input + alpha * (m1 @ m2).
+            Its keyword arguments are beta, alpha and out, so when beta or alpha is passed
+            an extra operator.mul node has to be inserted into the computation graph to
+            compute the final result.
+        """
+
+    @abstractmethod
+    def generate(self):
+        """
+        Construct the whole restructured computation graph for a call_function node with bias addition inside.
+
+        Such a graph may contain a weight node, a bias node, a non-bias computation node, a bias
+        reshape node if needed and a bias addition node.
+        Using torch.addmm as an example, the original node is:
+            %addmm: call_function[target=torch.addmm](args = (%input_1, %m1, %m2), kwargs = {beta=3, alpha=2})
+        and the restructured graph is:
+            %transpose : [#users=1] = call_function[target=torch.transpose](args = (%m2, 0, 1), kwargs = {})
+            %linear : [#users=1] = call_function[target=torch._C._nn.linear](args = (%m1, %transpose), kwargs = {})
+            %mul : [#users=1] = call_function[target=operator.mul](args = (%input_1, 3), kwargs = {})
+            %mul_1 : [#users=1] = call_function[target=operator.mul](args = (2, %linear), kwargs = {})
+            %add : [#users=1] = call_function[target=operator.add](args = (%mul_1, %mul), kwargs = {})
+        """
+
+
+class LinearBasedBiasFunc(BiasAdditionFunc):
+    """
+    Shared node builders for bias addition functions whose main computation is
+    replaced by F.linear.
+    """
+
+    def create_non_bias_func_proxy(self, input_proxy, other_proxy):
+        """
+        Create the node that performs the main computation, i.e. the substitute function
+        called with (input_proxy, other_proxy) and without a bias, and return its proxy.
+
+        Args:
+            input_proxy: first positional argument of the linear call.
+            other_proxy: second positional argument of the linear call.
+
+        Raises:
+            AssertionError: if substitute_func is not torch.nn.functional.linear.
+        """
+        linear_func = self.substitute_func
+        assert linear_func == torch.nn.functional.linear
+
+        # non-bias linear does not have any kwargs
+        return self.tracer.create_proxy('call_function', linear_func, (input_proxy, other_proxy), {})
+
+    def create_bias_addition_proxy(self, non_bias_func_proxy, bias_proxy):
+        """
+        Create the operator.add node computing non_bias_func_proxy + bias_proxy and return
+        its proxy. The two operands are added as they are; no reshape node is inserted here.
+        """
+        summands = (non_bias_func_proxy, bias_proxy)
+        return self.tracer.create_proxy('call_function', operator.add, summands, {})
+
+
+# Maps each patched bias-addition function to the function that replaces its main
+# computation; the tracer passes the mapped value to the handler as substitute_func.
+func_to_func_dict = {torch.addmm: F.linear}
--- a/colossalai/fx/tracer/tracer.py
+++ b/colossalai/fx/tracer/tracer.py
@@ -20,7 +20,7 @@ from torch.fx.proxy import ParameterProxy, Proxy

 from ..proxy import ColoProxy
 from ._tracer_utils import compute_meta_data_for_functions_proxy, extract_meta, is_element_in_list
-from .bias_addition_patch import module_to_func_dict
+from .bias_addition_patch import func_to_func_dict, module_to_func_dict
 from .registry import bias_addition_function, bias_addition_module, meta_patched_function, meta_patched_module

 __all__ = ['ColoTracer']
@@ -96,7 +96,8 @@ class ColoTracer(Tracer):
        handle = None
        if kind == "call_function":
            if bias_addition_function.has(target):
-                handle = bias_addition_function.get(target)(self, target, args, kwargs)
+                function_to_substitute = func_to_func_dict[target]
+                handle = bias_addition_function.get(target)(self, target, args, kwargs, function_to_substitute)
            elif bias_addition_function.has(target.__name__):
                # use name for some builtin op like @ (matmul)
                handle = bias_addition_function.get(target.__name__)(self, target, args, kwargs)