[utils] fixed lazy init context (#1867)

2025-09-09 13:00:52 +00:00 · 2022-11-10 15:17:20 +08:00
parent 50c4cb0167
commit e6ec99d389
2 changed files with 35 additions and 25 deletions
--- a/colossalai/utils/model/lazy_init_context.py
+++ b/colossalai/utils/model/lazy_init_context.py
@@ -1,23 +1,24 @@
 #!/usr/bin/env python
 # coding: utf-8

+import inspect
+import types
+from typing import Callable, List
+
 import torch
 import torch.nn as nn
-from colossalai.tensor import ColoParameter, ColoTensor

-import types
-import inspect
-from typing import List, Callable
+from colossalai.tensor import ColoParameter, ColoTensor
 from colossalai.utils.model.utils import substitute_init_recursively


 class LazyInitContext():
    """
-    A context to allow for lazy weight initialization of PyTorch modules. It intercepts the tensor 
+    A context to allow for lazy weight initialization of PyTorch modules. It intercepts the tensor
    initialization functions for lazy initialization

    Note:
-        This API is only experimental and subject to future changes. 
+        This API is only experimental and subject to future changes.

    Usage:
        with LazyInitContext() as ctx:
@@ -30,19 +31,20 @@ class LazyInitContext():
        # initialize weights
        ctx.lazy_init_parameters(model)

-        # make sure the weight is not a meta tensor 
+        # make sure the weight is not a meta tensor
        # and initialized correctly
        assert not model.weight.is_meta and torch.all(model.weight == 0)

    Args:
-        to_meta (bool): optional, whether to initialize the model with meta tensors, default is False.
-        extra_torch_tensor_func (List[str]): extra torch tensor functions related 
+        to_meta (bool): optional, whether to initialize the model with meta tensors, default is True. This
+            argument exists for now because some corner cases such as self.weight = torch.zeros(...) cannot be captured yet.
+        extra_torch_tensor_func (List[str]): extra torch tensor functions related
            to value setting, such as `zero_` and `triu_`. `zero_` is pre-added by default.
    """

    tensor_set_value_func = ['zero_', 'fill_']

-    def __init__(self, to_meta: bool = False, extra_torch_tensor_func: List[str] = None):
+    def __init__(self, to_meta: bool = True, extra_torch_tensor_func: List[str] = None):
        # TODO: hijack the torch constructor functions as well
        self._to_meta = to_meta
        self._intercepted_nn_init_func_cache = {}
@@ -212,18 +214,19 @@ class LazyInitContext():
                materialized_tensor = torch.empty_like(tensor, device=device)
                # if this tensor is a meta tensor, it must have an init function
                assert tensor in self._intercepted_nn_init_func_cache
-                tensor = materialized_tensor
+            else:
+                materialized_tensor = tensor

            # apply init function
            if tensor in self._intercepted_nn_init_func_cache:
                init_func, args, kwargs = self._intercepted_nn_init_func_cache[tensor][-1]
-                init_func(tensor, *args, **kwargs)
+                init_func(materialized_tensor, *args, **kwargs)

            # convert it to ColoTensor or ColoParameter
            if is_param:
-                tensor = ColoParameter.from_torch_tensor(tensor, requires_grad=tensor.requires_grad)
+                tensor = ColoParameter.from_torch_tensor(materialized_tensor, requires_grad=tensor.requires_grad)
            else:
-                tensor = ColoTensor.from_torch_tensor(tensor)
+                tensor = ColoTensor.from_torch_tensor(materialized_tensor)

            # override the original tensor
            with torch.no_grad():