Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-17 07:00:37 +00:00)
[doc] update rst and docstring (#1351)
* update rst
* add zero docstr
* fix docstr
* remove fx.tracer.meta_patch
* fix docstr
* fix docstr
* update fx rst
* fix fx docstr
* remove useless rst
@@ -60,7 +60,7 @@ class GradScaler(object):
     * ``scaler.step(optimizer)`` safely unscales gradients and calls ``optimizer.step()``.
     * ``scaler.update()`` updates ``scaler``'s scale factor.

-    Example::
+    Example:

         # Creates a GradScaler once at the beginning of training.
         scaler = GradScaler()
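
Note: the docstring above describes the standard ``torch.cuda.amp`` gradient-scaling loop. A minimal sketch of that loop for context (``model``, ``optimizer``, ``loss_fn`` and ``data_loader`` are illustrative placeholders):

    import torch
    from torch.cuda.amp import GradScaler, autocast

    scaler = GradScaler()  # created once at the beginning of training

    for inputs, targets in data_loader:
        optimizer.zero_grad()
        with autocast():
            loss = loss_fn(model(inputs), targets)
        scaler.scale(loss).backward()  # scale the loss so fp16 gradients do not underflow
        scaler.step(optimizer)         # unscales gradients; skips the step if they contain inf/NaN
        scaler.update()                # adjust the scale factor for the next iteration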
@@ -102,10 +102,10 @@ def sync_states():
 def seed(parallel_mode: ParallelMode):
     """ A context for seed switch

-    Examples::
+    Examples:

-        with seed(ParallelMode.DATA):
-            output = F.dropout(input)
+        >>> with seed(ParallelMode.DATA):
+        >>>     output = F.dropout(input)

     Note:
         The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@@ -124,18 +124,18 @@ def with_seed(func, parallel_mode: ParallelMode):
     """
     A function wrapper which executes the function with a specified seed.

-    Examples::
+    Examples:

-        # use with decorator
-        @with_seed(ParallelMode.DATA)
-        def forward(input):
-            return F.dropout(input)
-        out = forward(input)
-        # OR use it inline
-        def forward(input):
-            return F.dropout(input)
-        wrapper_forward = with_seed(forward, ParallelMode.DATA)
-        out = wrapped_forward(input)
+        >>> # use with decorator
+        >>> @with_seed(ParallelMode.DATA)
+        >>> def forward(input):
+        >>>     return F.dropout(input)
+        >>> out = forward(input)
+        >>> # OR use it inline
+        >>> def forward(input):
+        >>>     return F.dropout(input)
+        >>> wrapper_forward = with_seed(forward, ParallelMode.DATA)
+        >>> out = wrapped_forward(input)

     Note:
         The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
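
Note: both ``seed`` and ``with_seed`` temporarily switch to the RNG state tracked for the given ``ParallelMode`` and restore the previous state afterwards. A minimal sketch of that save/switch/restore pattern in plain ``torch`` (this is not ColossalAI's actual implementation; the per-mode state table is an assumed stand-in for its seed manager):

    import contextlib
    import torch

    _mode_states = {}  # hypothetical: CUDA RNG state kept per parallel mode

    @contextlib.contextmanager
    def switch_rng_state(mode):
        prev_state = torch.cuda.get_rng_state()       # remember the current state
        torch.cuda.set_rng_state(_mode_states[mode])  # activate the state kept for this mode
        try:
            yield
        finally:
            _mode_states[mode] = torch.cuda.get_rng_state()  # keep the advanced state for next time
            torch.cuda.set_rng_state(prev_state)             # restore the previous state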
@@ -12,7 +12,8 @@ class ColoProxy(Proxy):
     ColoProxy is a proxy class which uses meta tensor to handle data-dependent control flow. The original torch.fx proxy
     cannot be used to infer the condition statement, with this proxy, torch.fx can still run even with if statements.

-    Usage:
+    Example::
+
         proxy = tracer.create_proxy(...)
         proxy.meta_data = torch.empty(4, 2, device='meta')
         print(len(proxy)) # expect output 4
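
Note: the ``device='meta'`` tensor used above carries only shape and dtype, no storage, which is what lets shape queries such as ``len(proxy)`` succeed during tracing. A quick illustration with plain PyTorch:

    import torch

    t = torch.empty(4, 2, device='meta')  # allocates no data, only metadata
    print(t.shape)  # torch.Size([4, 2])
    print(len(t))   # 4
    # reading values (e.g. t.tolist()) would fail, since a meta tensor holds no data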
@@ -35,7 +35,8 @@ class ColoTracer(Tracer):
     ColoTracer is a symbolic tracer designed to support dynamic control flow by using meta tensors for the `colossalai.fx` module.
     This tracer is initialized in the same way as the original torch.fx.Tracer.

-    Usage:
+    Usage::
+
         class Model(nn.Module):
             def __init__(self):
                 super().__init__()
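
Note: the dynamic control flow mentioned above is exactly what the stock tracer rejects: a branch on tensor data raises a trace error because a plain proxy has no value to compare. A small illustration of the limitation ColoTracer is designed to work around (the module is illustrative):

    import torch
    import torch.nn as nn
    from torch.fx import symbolic_trace

    class Branchy(nn.Module):
        def forward(self, x):
            if x.sum() > 0:  # data-dependent branch
                return x * 2
            return x - 1

    # symbolic_trace(Branchy()) raises a TraceError on the branch condition,
    # because a plain fx Proxy cannot be evaluated to a bool during tracing.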
@@ -254,8 +255,8 @@ class ColoTracer(Tracer):
         non_meta_arg_names = sig_names - meta_arg_names
         for k, v in sig.parameters.items():
             if k in non_meta_arg_names and \
-                k not in concrete_args and \
-                v.default is not inspect.Parameter.empty:
+                    k not in concrete_args and \
+                    v.default is not inspect.Parameter.empty:
                 concrete_args[k] = v.default

         # get non concrete arg names
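
Note: the loop above copies default values into ``concrete_args`` for parameters that are neither meta arguments nor already concrete. The same ``inspect`` pattern on a hypothetical function, for reference:

    import inspect

    def fn(x, scale=1.0, bias=None):
        return x * scale if bias is None else x * scale + bias

    meta_arg_names = {'x'}
    concrete_args = {}
    sig = inspect.signature(fn)
    for k, v in sig.parameters.items():
        if k not in meta_arg_names and \
                k not in concrete_args and \
                v.default is not inspect.Parameter.empty:
            concrete_args[k] = v.default
    print(concrete_args)  # {'scale': 1.0, 'bias': None}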
@@ -13,6 +13,14 @@ class GeminiManager:

     PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management
     https://arxiv.org/abs/2108.05818
+
+    Args:
+        placement_policy (str): Which device to place *held* tensors. It can be 'cpu', 'cuda' and 'auto'.
+            If it's 'cpu', parameters, gradients and optimizer states will be offloaded to CPU, which means min CUDA memory will be used.
+            If it's 'cuda', they won't be offloaded, which means max CUDA memory will be used.
+            If it's 'auto', they are moving dynamically based on CPU and CUDA memory usage. It will utilize heterogeneous memory space evenly and well.
+            Note that 'auto' policy can only work well when no other processes use CUDA during your training.
+        chunk_manager (ChunkManager): A ``ChunkManager`` instance.
     """

     def __init__(self, placement_policy: str, chunk_manager: ChunkManager) -> None:
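
Note: given the ``__init__`` signature above, wiring up the manager only needs the policy string and a ``ChunkManager``. A sketch, assuming ``chunk_manager`` has already been constructed elsewhere:

    # 'cpu' keeps held tensors offloaded (min CUDA memory), 'cuda' keeps them on the GPU
    # (max CUDA memory), and 'auto' moves them based on current CPU/CUDA memory usage.
    gemini_manager = GeminiManager(placement_policy='auto', chunk_manager=chunk_manager)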
@@ -40,7 +40,7 @@ def _cast_float(args, dtype: torch.dtype):
 class ColoDDP(torch.nn.Module):
     """Distributed data parallel for ColoTensor. Nested ColoDDP is not supported now.

-    Example::
+    Example:
         >>> from colossalai.core import global_context as gpc
         >>> from colossalai.context import ParallelMode
         >>> model = torch.nn.Linear(20, 1)
@@ -148,7 +148,7 @@ class ColoDDP(torch.nn.Module):
         """Sets parameters to be ignored by DDP.
         This method must be called before initializing ColoDDP.

-        Example::
+        Example:
             >>> params_to_ignore = []
             >>> for p in module.parameters():
             >>>     if should_ignore(p):
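
Note: the docstring example continues beyond this hunk. A hedged sketch of the full pattern it describes (``should_ignore`` is a hypothetical user-defined predicate, and the method name ``set_params_to_ignore`` is an assumption not shown above):

    params_to_ignore = []
    for p in module.parameters():
        if should_ignore(p):            # hypothetical user-defined predicate
            params_to_ignore.append(p)
    ColoDDP.set_params_to_ignore(params_to_ignore)  # assumed name; must run before wrapping the module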
@@ -174,7 +174,7 @@ class ZeroDDP(ColoDDP):
     We can configure chunk and gemini via ChunkManager and GeminiManager respectively.
     For more details, see the API reference of ``ChunkManager`` and ``GeminiManager``.

-    Example::
+    Example:
         >>> model = torch.nn.Linear(20, 1)
         >>> placement_policy = 'cuda'
         >>> chunk_size = ChunkManager.search_chunk_size(model, search_range, n_grids) if use_chunk else None
@@ -283,7 +283,7 @@ class ZeroDDP(ColoDDP):
             dict:
                 a dictionary containing a whole state of the module

-        Example::
+        Example:

             >>> module.state_dict().keys()
             ['bias', 'weight']
@@ -21,7 +21,7 @@ def colo_op_impl(func):
     by ``__torch_function__`` dispatch and has a ColoTensor as any of its
     parameters, the function provided will be invoked for that operator.

-    Example::
+    Example:
        >>> @colo_op_impl(torch.nn.functional.linear)
        >>> def my_custom_linear(types, args, kwargs, process_group):
        >>>   ....
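
Note: ``colo_op_impl`` registers a handler that ``__torch_function__`` dispatch selects when a ColoTensor reaches the wrapped operator. A generic, self-contained sketch of that registry pattern (illustrative names, not ColossalAI's actual implementation):

    import torch

    _OP_TABLE = {}  # maps a torch op to its custom handler

    def register_op(func):
        def decorator(impl):
            _OP_TABLE[func] = impl
            return impl
        return decorator

    class MyTensor(torch.Tensor):
        @classmethod
        def __torch_function__(cls, func, types, args=(), kwargs=None):
            kwargs = kwargs or {}
            if func in _OP_TABLE:
                return _OP_TABLE[func](types, args, kwargs)  # dispatch to the registered handler
            return super().__torch_function__(func, types, args, kwargs)

    @register_op(torch.nn.functional.linear)
    def my_linear(types, args, kwargs):
        x, weight = args[0], args[1]
        return x @ weight.t()  # simplified handler: ignores the bias argument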
@@ -41,7 +41,7 @@ class ParamOpHookManager:
     def use_hooks(*hooks: ParamOpHook):
         """Change the param op hooks you use. Nested calling is allowed.

-        Example::
+        Example:
            >>> with ParamOpHookManager.use_hooks(*hooks):
            >>>     do_something()
            >>> with ParamOpHookManager.use_hooks():
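
Note: "nested calling is allowed" means every ``with`` block installs its own hook set and the enclosing set is restored on exit, so ``use_hooks()`` with no arguments temporarily disables all hooks. A generic sketch of such a re-entrant hook stack (illustrative, not ColossalAI's implementation):

    import contextlib

    class HookManager:
        hooks = ()  # hooks currently in effect

        @classmethod
        @contextlib.contextmanager
        def use_hooks(cls, *hooks):
            enclosing = cls.hooks      # remember the hooks of the outer block
            cls.hooks = hooks
            try:
                yield
            finally:
                cls.hooks = enclosing  # restore them when this block exits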
@@ -26,7 +26,7 @@ def named_params_with_colotensor(
         (string, Union[Tensor, ColoTensor]): Tuple containing
             the name and parameter (or ColoTensor parameter)

-    Example::
+    Example:

         >>> model = torch.nn.Linear(*linear_size)
         >>> delattr(model.weight)
@@ -16,6 +16,30 @@ class OptimState(Enum):


 class ZeroOptimizer(ColossalaiOptimizer):
+    """A wrapper for optimizer. ``ZeroDDP`` and ``ZeroOptimizer`` implement Zero Redundancy Optimizer (ZeRO state-3).
+
+    Note:
+        You must use ``ZeroDDP`` with ``ZeroOptimizer``.
+
+    Note:
+        Make sure you set ``placement_policy`` of ``GeminiManager`` to `"auto"`,
+        if you set ``gpu_margin_mem_ratio > 0``.
+
+    Args:
+        optim (Optimizer): An Optimizer instance.
+        module (ZeroDDP): A ``ZeroDDP`` instance.
+        gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
+            which will be used when using hybrid CPU optimizer.
+            This argument is meaningless when `placement_policy` of `GeminiManager` is not "auto".
+            Defaults to 0.0.
+        initial_scale (float, optional): Initial scale used by DynamicGradScaler. Defaults to 2**32.
+        min_scale (float, optional): Min scale used by DynamicGradScaler. Defaults to 1.
+        growth_factor (float, optional): growth_factor used by DynamicGradScaler. Defaults to 2.
+        backoff_factor (float, optional): backoff_factor used by DynamicGradScaler. Defaults to 0.5.
+        growth_interval (float, optional): growth_interval used by DynamicGradScaler. Defaults to 1000.
+        hysteresis (float, optional): hysteresis used by DynamicGradScaler. Defaults to 2.
+        max_scale (int, optional): max_scale used by DynamicGradScaler. Defaults to 2**32.
+    """

     def __init__(self,
                  optim: Optimizer,
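
Note: the DynamicGradScaler arguments documented above follow the usual dynamic loss-scaling scheme: back off by ``backoff_factor`` after repeated overflow steps (delayed by ``hysteresis``), grow by ``growth_factor`` after ``growth_interval`` consecutive clean steps, and clamp the result to ``[min_scale, max_scale]``. A sketch of that update rule (illustrative only, not the actual DynamicGradScaler code):

    def update_scale(scale, found_overflow, state,
                     growth_factor=2.0, backoff_factor=0.5, growth_interval=1000,
                     hysteresis=2, min_scale=1.0, max_scale=2**32):
        # state tracks consecutive clean steps and the remaining hysteresis budget
        if found_overflow:
            state['good_steps'] = 0
            state['hysteresis'] -= 1
            if state['hysteresis'] <= 0:
                scale = max(scale * backoff_factor, min_scale)  # shrink after repeated overflows
                state['hysteresis'] = hysteresis
        else:
            state['good_steps'] += 1
            if state['good_steps'] >= growth_interval:
                scale = min(scale * growth_factor, max_scale)   # grow after a clean streak
                state['good_steps'] = 0
        return scale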