[docs] updatad docs of hybrid adam and cpu adam (#552)

2025-09-05 02:51:59 +00:00 · 2022-03-30 18:14:59 +08:00
parent 014bac0c49
commit c44d797072
5 changed files with 105 additions and 14 deletions
--- a/colossalai/nn/optimizer/hybrid_adam.py
+++ b/colossalai/nn/optimizer/hybrid_adam.py
@@ -1,7 +1,55 @@
 import torch
-from colossalai.utils import multi_tensor_applier

+
+from colossalai.utils import multi_tensor_applier
+from colossalai.registry import OPTIMIZERS
+
+
+@OPTIMIZERS.register_module
 class HybridAdam(torch.optim.Optimizer):
+    """Implements Adam algorithm.
+
+    Supports parameters updating on both GPU and CPU, depanding on the device of paramters.
+    But the parameters and gradients should on the same device: 
+      * Parameters on CPU and gradients on CPU is allowed.
+      * Parameters on GPU and gradients on GPU is allowed.
+      * Parameters on GPU and gradients on CPU is **not** allowed.
+    
+    Requires ColossalAI to be installed via ``pip install .``
+
+    This version of Hybrid Adam is an hybrid of CPUAdam and FusedAdam.
+      * For parameters updating on CPU, it uses CPUAdam.
+      * For parameters updating on GPU, it uses FusedAdam.
+      * Hybird precision calculation of fp16 and fp32 is supported, eg fp32 parameters and fp16 gradients.
+
+    :class:`colossalai.nn.optimizer.HybridAdam` may be used as a drop-in replacement for ``torch.optim.AdamW``,
+    or ``torch.optim.Adam`` with ``adamw_mode=False``
+
+    Adam was been proposed in `Adam: A Method for Stochastic Optimization`_.
+
+    Arguments:
+        model_params (iterable): iterable of parameters of dicts defining
+            parameter groups.
+        lr (float, optional): learning rate. (default: 1e-3)
+        betas (Tuple[float, float], optional): coefficients used for computing
+            running averages of gradient and its square. (default: (0.9, 0.999))
+        eps (float, optional): term added to the denominator to improve
+            numerical stability. (default: 1e-8)
+        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
+        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
+            algorithm from the paper `On the Convergence of Adam and Beyond`_
+            (default: False) NOT SUPPORTED yet in CPUAdam!
+        adamw_mode (boolean, optional): Apply L2 regularization or weight decay
+            True for decoupled weight decay(also known as AdamW) (default: True)
+        simd_log (boolean, optional): whether to show if you are using SIMD to 
+            accelerate. (default: False)
+    
+    .. _Adam: A Method for Stochastic Optimization:
+        https://arxiv.org/abs/1412.6980
+    .. _On the Convergence of Adam and Beyond:
+        https://openreview.net/forum?id=ryQu7f-RZ
+    """
+
    optimizer_id = 0
    # Number of fp32 shards for per parameter
    # Param weight, grad, momentum and variance
@@ -16,11 +64,6 @@ class HybridAdam(torch.optim.Optimizer):
                 weight_decay=0,
                 adamw_mode=True,
                 simd_log=False):
-        """
-        An implementation equivalent to `torch.optim.Adam`.
-        The difference is that model_params are sharded parameters belonging to a ShardedModelV2 instance.
-        The sharded param of model_params can resident on both CPU and CUDA(fused adam).
-        """

        default_args = dict(lr=lr, betas=betas, eps=eps, weight_decay=weight_decay, bias_correction=bias_correction)
        super(HybridAdam, self).__init__(model_params, default_args)