[zero] add constant placement policy (#1705)

* fixes a memory leak in ZeroDDP initialization when parameters are in fp16.
* bans releasing chunks on CUDA; a chunk may be released only when it is about to be offloaded.
* adds a constant placement policy, which lets users reserve a fixed caching memory space on CUDA for parameters (see the conceptual sketch below).
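
To make the last two points concrete, here is a rough conceptual sketch of a constant placement policy. The names below (ConstPlacementPolicy, Chunk, place) are hypothetical and are not the ColossalAI API added by this commit; the sketch only illustrates the idea of a fixed CUDA budget where a chunk's CUDA memory is released only at the moment it is offloaded.

# Illustrative sketch only -- ConstPlacementPolicy, Chunk and place() are
# hypothetical names, not the actual ColossalAI implementation.
from typing import List


class Chunk:
    """A contiguous block of parameters with a fixed size in bytes."""

    def __init__(self, size_bytes: int) -> None:
        self.size_bytes = size_bytes
        self.on_cuda = False


class ConstPlacementPolicy:
    """Keep at most `budget_bytes` of parameter chunks resident on CUDA.

    A chunk's CUDA memory is released only when that chunk is selected for
    offload to CPU, never eagerly.
    """

    def __init__(self, budget_bytes: int) -> None:
        self.budget_bytes = budget_bytes
        self.resident: List[Chunk] = []
        self.used_bytes = 0

    def place(self, chunk: Chunk) -> None:
        """Move `chunk` onto CUDA, offloading older chunks if the budget is full."""
        while self.used_bytes + chunk.size_bytes > self.budget_bytes and self.resident:
            victim = self.resident.pop(0)
            victim.on_cuda = False                 # offload victim to CPU ...
            self.used_bytes -= victim.size_bytes   # ... and only then release its CUDA space
        self.resident.append(chunk)
        chunk.on_cuda = True
        self.used_bytes += chunk.size_bytes
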
Author: HELSON
Date: 2022-10-14 17:53:16 +08:00
Committed by: GitHub
Parent: 5f41463a76
Commit: 1468e4bcfc
11 changed files with 117 additions and 57 deletions


@@ -33,7 +33,10 @@ def ColoModulize(module):
 class ColoInitContext(InsertPostInitMethodToModuleSubClasses):
-    def __init__(self, lazy_memory_allocate: bool = False, device: torch.device = torch.device('cpu')):
+    def __init__(self,
+                 lazy_memory_allocate: bool = False,
+                 device: torch.device = torch.device('cpu'),
+                 dtype: torch.dtype = torch.float):
         """
         Args:
             lazy_memory_allocate (bool, optional): whether to allocate memory for the parameter tensors. Defaults to False.
@@ -42,6 +45,7 @@ class ColoInitContext(InsertPostInitMethodToModuleSubClasses):
         super().__init__()
         self._lazy_memory_allocate = lazy_memory_allocate
         self._device = device
+        self._dtype = dtype
         self._register_colo_modules()
@@ -87,7 +91,8 @@ class ColoInitContext(InsertPostInitMethodToModuleSubClasses):
             # detaching tensor is necessary for optimizers.
             requires_grad = param.requires_grad
             # TODO(jiaruifang) we initialize a Default PG memory
-            colo_param = ColoParameter(param.to(self._device), requires_grad=requires_grad)
+            colo_param = ColoParameter(param.to(device=self._device, dtype=self._dtype),
+                                       requires_grad=requires_grad)
             # add mapping record
             replaced_tensors[param] = colo_param
             delattr(submodule, param_name)
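
For reference, a minimal usage sketch of the new dtype argument follows. The import path is an assumption based on where ColoInitContext usually lives in the repository, and nn.Linear stands in for a real model.

# Minimal usage sketch (import path is assumed, not confirmed by this diff).
import torch
import torch.nn as nn

from colossalai.utils.model.colo_init_context import ColoInitContext

# Materialize parameters directly on CPU in fp16 at construction time,
# instead of building them in fp32 and casting afterwards.
with ColoInitContext(device=torch.device('cpu'), dtype=torch.half):
    model = nn.Linear(1024, 1024)

# Per the hunk above, each parameter is converted with
# param.to(device=self._device, dtype=self._dtype), so it ends up in fp16.
assert next(model.parameters()).dtype == torch.half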