[tensor] refactor colo-tensor (#992)

* refactor colo-tensor and update linear op * polish code * polish code * update ops and unit tests * update unit tests * polish code * rename dist_spec module * polish code * polish code * remove unneeded import * fix pipelinable
2025-09-07 12:01:39 +00:00 · 2022-05-19 12:44:59 +08:00
parent 1467d83edf
commit ad536e308e
27 changed files with 657 additions and 616 deletions
--- a/colossalai/utils/model/colo_init_context.py
+++ b/colossalai/utils/model/colo_init_context.py
@@ -7,11 +7,13 @@ from torch import nn
 from typing import Iterator, Tuple, Union, Optional

 # find named_params includes replica
+
+
 def _named_params_with_replica(
-        module: nn.Module,
-        prefix: str = '',
-        recurse: bool = True,
-    ) -> Iterator[Tuple[str, Union[nn.Parameter, ColoTensor]]]:
+    module: nn.Module,
+    prefix: str = '',
+    recurse: bool = True,
+) -> Iterator[Tuple[str, Union[nn.Parameter, ColoTensor]]]:
    modules = module.named_modules(prefix=prefix) if recurse else [(prefix, module)]

    for mod_prefix, mod in modules:
@@ -21,11 +23,13 @@ def _named_params_with_replica(
            name = mod_prefix + ('.' if mod_prefix else '') + name
            yield name, val

+
 # Adapted from torch.nn.module.Module.register_param
+
+
 def _register_parameter_with_colotensor(self, name: str, param):
    if '_parameters' not in self.__dict__:
-        raise AttributeError(
-            "cannot assign parameter before Module.__init__() call")
+        raise AttributeError("cannot assign parameter before Module.__init__() call")

    if not isinstance(name, torch._six.string_classes):
        raise TypeError("parameter name should be a string. "
@@ -41,19 +45,21 @@ def _register_parameter_with_colotensor(self, name: str, param):
        self._parameters[name] = None
    elif not isinstance(param, (torch.nn.Parameter, ColoParameter)):
        raise TypeError("cannot assign '{}' object to parameter '{}' "
-                        "(torch.nn.Parameter or ColoParameter or None required)"
-                        .format(torch.typename(param), name))
+                        "(torch.nn.Parameter or ColoParameter or None required)".format(torch.typename(param), name))
    elif param.grad_fn:
-        raise ValueError(
-            "Cannot assign non-leaf Tensor to parameter '{0}'. Model "
-            "parameters must be created explicitly. To express '{0}' "
-            "as a function of another Tensor, compute the value in "
-            "the forward() method.".format(name))
+        raise ValueError("Cannot assign non-leaf Tensor to parameter '{0}'. Model "
+                         "parameters must be created explicitly. To express '{0}' "
+                         "as a function of another Tensor, compute the value in "
+                         "the forward() method.".format(name))
    else:
        self._parameters[name] = param

+
 # Adapted from torch.nn.module.Module.__setattr__
+
+
 def _setattr_with_colotensor(self, name: str, value: Union[torch.Tensor, torch.nn.Module, ColoTensor]):
+
    def remove_from(*dicts_or_sets):
        for d in dicts_or_sets:
            if name in d:
@@ -65,70 +71,45 @@ def _setattr_with_colotensor(self, name: str, value: Union[torch.Tensor, torch.n
    params = self.__dict__.get('_parameters')
    if isinstance(value, (ColoTensor, torch.nn.Parameter)):
        if params is None:
-            raise AttributeError(
-                "cannot assign parameters before Module.__init__() call")
+            raise AttributeError("cannot assign parameters before Module.__init__() call")
        remove_from(self.__dict__, self._buffers, self._modules, self._non_persistent_buffers_set)
        self.register_parameter(name, value)
    elif params is not None and name in params:
        if value is not None:
            raise TypeError("cannot assign '{}' as parameter '{}' "
-                            "(torch.nn.Parameter or None expected)"
-                            .format(torch.typename(value), name))
+                            "(torch.nn.Parameter or None expected)".format(torch.typename(value), name))
        self.register_parameter(name, value)
    else:
        modules = self.__dict__.get('_modules')
        if isinstance(value, torch.nn.Module):
            if modules is None:
-                raise AttributeError(
-                    "cannot assign module before Module.__init__() call")
+                raise AttributeError("cannot assign module before Module.__init__() call")
            remove_from(self.__dict__, self._parameters, self._buffers, self._non_persistent_buffers_set)
            modules[name] = value
        elif modules is not None and name in modules:
            if value is not None:
                raise TypeError("cannot assign '{}' as child module '{}' "
-                                "(torch.nn.Module or None expected)"
-                                .format(torch.typename(value), name))
+                                "(torch.nn.Module or None expected)".format(torch.typename(value), name))
            modules[name] = value
        else:
            buffers = self.__dict__.get('_buffers')
            if buffers is not None and name in buffers:
                if value is not None and not isinstance(value, torch.Tensor):
                    raise TypeError("cannot assign '{}' as buffer '{}' "
-                                    "(torch.Tensor or None expected)"
-                                    .format(torch.typename(value), name))
+                                    "(torch.Tensor or None expected)".format(torch.typename(value), name))
                buffers[name] = value
            else:
                object.__setattr__(self, name, value)

+
 def ColoModulize(module):
    """
    Replacing the parameters() and named_parameters() with our customized ones
    """

-    def fake_parameters(self, *args, **kargs):
-        for p in module.old_parameters(*args, **kargs):
-            if isinstance(p, ColoTensor):
-                yield p.torch_tensor()
-            elif isinstance(p, torch.Tensor):
-                yield p
-
-    def fake_named_parameters(self, *args, **kargs):
-        for name, p in module.old_named_parameters(*args, **kargs):
-            if isinstance(p, ColoTensor):
-                yield name, p.torch_tensor()
-            elif isinstance(p, torch.Tensor):
-                yield name, p
-
-    module.old_named_parameters = module.named_parameters
-    module.old_parameters = module.parameters
-
-    funcType = types.MethodType
-    module.parameters = funcType(fake_parameters, module)
-    module.named_parameters = funcType(fake_named_parameters, module)
-    module.colo_parameters = module.old_parameters
-    module.colo_named_parameters = module.old_named_parameters
    module._colo_visited = True

+
 class ColoInitContext(InsertPostInitMethodToModuleSubClasses):

    def __init__(self, lazy_memory_allocate: bool = False, device: torch.device = torch.device('cpu')):
@@ -159,15 +140,16 @@ class ColoInitContext(InsertPostInitMethodToModuleSubClasses):
                continue

            split = name.rfind('.')
-            if split >= 0: # param in submodule
+            if split >= 0:    # param in submodule
                module_name = name[:split]
-                param_name = name[split+1:]
+                param_name = name[split + 1:]
            else:
-                module_name = '' # param in current module
+                module_name = ''    # param in current module
                param_name = name
            name_list.append((module_name, param_name))

-        replaced_tensors = dict() # record mapping between (torch.Tensor, ColoTensor) to distinguish the same reference
+        replaced_tensors = dict(
+        )    # record mapping between (torch.Tensor, ColoTensor) to distinguish the same reference
        for module_name, param_name in name_list:
            submodule = module.get_submodule(module_name)
            param = submodule.get_parameter(param_name)
@@ -177,13 +159,11 @@ class ColoInitContext(InsertPostInitMethodToModuleSubClasses):
                save_torch_payload = True if not self._lazy_memory_allocate else False
                # detaching tensor is necessary for optimizers.
                requires_grad = param.requires_grad
-                tensor_detached = param.to(self._device).detach()
-                tensor_detached.requires_grad = requires_grad

-                colo_param = ColoParameter.init_from_torch_tensor(tensor=tensor_detached, save_payload=save_torch_payload)
+                colo_param = ColoParameter(param.to(self._device), requires_grad=requires_grad)
                # add mapping record
                replaced_tensors[param] = colo_param
            delattr(submodule, param_name)
            setattr(submodule, param_name, colo_param)

-        ColoModulize(module)
+        ColoModulize(module)
--- a/colossalai/utils/model/pipelinable.py
+++ b/colossalai/utils/model/pipelinable.py
@@ -83,7 +83,7 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses):

        for name, param in name_list:
            delattr(module, name)
-            setattr(module, name, ColoTensor.init_from_torch_tensor(tensor=param, save_payload=False))
+            setattr(module, name, ColoTensor.from_torch_tensor(param))

    def to_layer_list(self, exec_seq=None):
        """
@@ -91,7 +91,7 @@ class PipelinableContext(InsertPostInitMethodToModuleSubClasses):
        If exec_seq is None, we will take the module initizing order as execution order.
        """
        if exec_seq is None:
-            #if user do not provide the model executing sequence, we use the initialization order as the executing order.
+            # If the user does not provide the model's execution sequence, we use the initialization order as the execution order.
            children_name = []
            for child in self._root_children:
                layer_spec = self._layer_spec_dict[id(child)]