Mirror of https://github.com/hpcaitech/ColossalAI.git (synced 2025-09-17 07:00:37 +00:00)
[doc] update rst and docstring (#1351)
* update rst
* add zero docstr
* fix docstr
* remove fx.tracer.meta_patch
* fix docstr
* fix docstr
* update fx rst
* fix fx docstr
* remove useless rst
@@ -60,7 +60,7 @@ class GradScaler(object):
     * ``scaler.step(optimizer)`` safely unscales gradients and calls ``optimizer.step()``.
     * ``scaler.update()`` updates ``scaler``'s scale factor.

-    Example::
+    Example:

         # Creates a GradScaler once at the beginning of training.
         scaler = GradScaler()
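
Note: the docstring above describes the standard ``torch.cuda.amp`` gradient-scaling loop. A minimal sketch of that loop for context (``model``, ``optimizer``, ``loss_fn`` and ``data_loader`` are illustrative placeholders):

    import torch
    from torch.cuda.amp import GradScaler, autocast

    scaler = GradScaler()  # created once at the beginning of training

    for inputs, targets in data_loader:
        optimizer.zero_grad()
        with autocast():
            loss = loss_fn(model(inputs), targets)
        scaler.scale(loss).backward()  # scale the loss so fp16 gradients do not underflow
        scaler.step(optimizer)         # unscales gradients; skips the step if they contain inf/NaN
        scaler.update()                # adjust the scale factor for the next iteration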
@@ -102,10 +102,10 @@ def sync_states():
 def seed(parallel_mode: ParallelMode):
     """ A context for seed switch

-    Examples::
+    Examples:

-        with seed(ParallelMode.DATA):
-            output = F.dropout(input)
+        >>> with seed(ParallelMode.DATA):
+        >>>     output = F.dropout(input)

     Note:
         The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
@@ -124,18 +124,18 @@ def with_seed(func, parallel_mode: ParallelMode):
     """
     A function wrapper which executes the function with a specified seed.

-    Examples::
+    Examples:

-        # use with decorator
-        @with_seed(ParallelMode.DATA)
-        def forward(input):
-            return F.dropout(input)
-        out = forward(input)
-        # OR use it inline
-        def forward(input):
-            return F.dropout(input)
-        wrapper_forward = with_seed(forward, ParallelMode.DATA)
-        out = wrapped_forward(input)
+        >>> # use with decorator
+        >>> @with_seed(ParallelMode.DATA)
+        >>> def forward(input):
+        >>>     return F.dropout(input)
+        >>> out = forward(input)
+        >>> # OR use it inline
+        >>> def forward(input):
+        >>>     return F.dropout(input)
+        >>> wrapper_forward = with_seed(forward, ParallelMode.DATA)
+        >>> out = wrapped_forward(input)

     Note:
         The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
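
Note: both ``seed`` and ``with_seed`` temporarily switch to the RNG state tracked for the given ``ParallelMode`` and restore the previous state afterwards. A minimal sketch of that save/switch/restore pattern in plain ``torch`` (this is not ColossalAI's actual implementation; the per-mode state table is an assumed stand-in for its seed manager):

    import contextlib
    import torch

    _mode_states = {}  # hypothetical: CUDA RNG state kept per parallel mode

    @contextlib.contextmanager
    def switch_rng_state(mode):
        prev_state = torch.cuda.get_rng_state()       # remember the current state
        torch.cuda.set_rng_state(_mode_states[mode])  # activate the state kept for this mode
        try:
            yield
        finally:
            _mode_states[mode] = torch.cuda.get_rng_state()  # keep the advanced state for next time
            torch.cuda.set_rng_state(prev_state)             # restore the previous state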
@@ -12,7 +12,8 @@ class ColoProxy(Proxy):
     ColoProxy is a proxy class which uses meta tensor to handle data-dependent control flow. The original torch.fx proxy
     cannot be used to infer the condition statement, with this proxy, torch.fx can still run even with if statements.

-    Usage:
+    Example::
+
         proxy = tracer.create_proxy(...)
         proxy.meta_data = torch.empty(4, 2, device='meta')
         print(len(proxy)) # expect output 4
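
Note: the ``device='meta'`` tensor used above carries only shape and dtype, no storage, which is what lets shape queries such as ``len(proxy)`` succeed during tracing. A quick illustration with plain PyTorch:

    import torch

    t = torch.empty(4, 2, device='meta')  # allocates no data, only metadata
    print(t.shape)  # torch.Size([4, 2])
    print(len(t))   # 4
    # reading values (e.g. t.tolist()) would fail, since a meta tensor holds no data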
@@ -35,7 +35,8 @@ class ColoTracer(Tracer):
     ColoTracer is a symbolic tracer designed to support dynamic control flow by using meta tensors for the `colossalai.fx` module.
     This tracer is initialized in the same way as the original torch.fx.Tracer.

-    Usage:
+    Usage::
+
         class Model(nn.Module):
             def __init__(self):
                 super().__init__()
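
Note: the dynamic control flow mentioned above is exactly what the stock tracer rejects: a branch on tensor data raises a trace error because a plain proxy has no value to compare. A small illustration of the limitation ColoTracer is designed to work around (the module is illustrative):

    import torch
    import torch.nn as nn
    from torch.fx import symbolic_trace

    class Branchy(nn.Module):
        def forward(self, x):
            if x.sum() > 0:  # data-dependent branch
                return x * 2
            return x - 1

    # symbolic_trace(Branchy()) raises a TraceError on the branch condition,
    # because a plain fx Proxy cannot be evaluated to a bool during tracing.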
@@ -254,8 +255,8 @@ class ColoTracer(Tracer):
         non_meta_arg_names = sig_names - meta_arg_names
         for k, v in sig.parameters.items():
             if k in non_meta_arg_names and \
-                k not in concrete_args and \
-                v.default is not inspect.Parameter.empty:
+                    k not in concrete_args and \
+                    v.default is not inspect.Parameter.empty:
                 concrete_args[k] = v.default

         # get non concrete arg names
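
Note: the loop above copies default values into ``concrete_args`` for parameters that are neither meta arguments nor already concrete. The same ``inspect`` pattern on a hypothetical function, for reference:

    import inspect

    def fn(x, scale=1.0, bias=None):
        return x * scale if bias is None else x * scale + bias

    meta_arg_names = {'x'}
    concrete_args = {}
    sig = inspect.signature(fn)
    for k, v in sig.parameters.items():
        if k not in meta_arg_names and \
                k not in concrete_args and \
                v.default is not inspect.Parameter.empty:
            concrete_args[k] = v.default
    print(concrete_args)  # {'scale': 1.0, 'bias': None}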
@@ -13,6 +13,14 @@ class GeminiManager:

     PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management
     https://arxiv.org/abs/2108.05818
+
+    Args:
+        placement_policy (str): Which device to place *held* tensors. It can be 'cpu', 'cuda' and 'auto'.
+            If it's 'cpu', parameters, gradients and optimizer states will be offloaded to CPU, which means min CUDA memory will be used.
+            If it's 'cuda', they won't be offloaded, which means max CUDA memory will be used.
+            If it's 'auto', they are moving dynamically based on CPU and CUDA memory usage. It will utilize heterogeneous memory space evenly and well.
+            Note that 'auto' policy can only work well when no other processes use CUDA during your training.
+        chunk_manager (ChunkManager): A ``ChunkManager`` instance.
     """

     def __init__(self, placement_policy: str, chunk_manager: ChunkManager) -> None:
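
Note: given the ``__init__`` signature above, wiring up the manager only needs the policy string and a ``ChunkManager``. A sketch, assuming ``chunk_manager`` has already been constructed elsewhere:

    # 'cpu' keeps held tensors offloaded (min CUDA memory), 'cuda' keeps them on the GPU
    # (max CUDA memory), and 'auto' moves them based on current CPU/CUDA memory usage.
    gemini_manager = GeminiManager(placement_policy='auto', chunk_manager=chunk_manager)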
@@ -40,7 +40,7 @@ def _cast_float(args, dtype: torch.dtype):
 class ColoDDP(torch.nn.Module):
     """Distributed data parallel for ColoTensor. Nested ColoDDP is not supported now.

-    Example::
+    Example:
         >>> from colossalai.core import global_context as gpc
         >>> from colossalai.context import ParallelMode
         >>> model = torch.nn.Linear(20, 1)
@@ -148,7 +148,7 @@ class ColoDDP(torch.nn.Module):
         """Sets parameters to be ignored by DDP.
         This method must be called before initializing ColoDDP.

-        Example::
+        Example:
             >>> params_to_ignore = []
             >>> for p in module.parameters():
             >>>     if should_ignore(p):
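
Note: the docstring example continues beyond this hunk. A hedged sketch of the full pattern it describes (``should_ignore`` is a hypothetical user-defined predicate, and the method name ``set_params_to_ignore`` is an assumption not shown above):

    params_to_ignore = []
    for p in module.parameters():
        if should_ignore(p):            # hypothetical user-defined predicate
            params_to_ignore.append(p)
    ColoDDP.set_params_to_ignore(params_to_ignore)  # assumed name; must run before wrapping the module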
@@ -174,7 +174,7 @@ class ZeroDDP(ColoDDP):
     We can configure chunk and gemini via ChunkManager and GeminiManager respectively.
     For more details, see the API reference of ``ChunkManager`` and ``GeminiManager``.

-    Example::
+    Example:
         >>> model = torch.nn.Linear(20, 1)
         >>> placement_policy = 'cuda'
         >>> chunk_size = ChunkManager.search_chunk_size(model, search_range, n_grids) if use_chunk else None
@@ -283,7 +283,7 @@ class ZeroDDP(ColoDDP):
             dict:
                 a dictionary containing a whole state of the module

-        Example::
+        Example:

             >>> module.state_dict().keys()
             ['bias', 'weight']
@@ -21,7 +21,7 @@ def colo_op_impl(func):
     by ``__torch_function__`` dispatch and has a ColoTensor as any of its
     parameters, the function provided will be invoked for that operator.

-    Example::
+    Example:
        >>> @colo_op_impl(torch.nn.functional.linear)
        >>> def my_custom_linear(types, args, kwargs, process_group):
        >>>   ....
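
Note: ``colo_op_impl`` registers a handler that ``__torch_function__`` dispatch selects when a ColoTensor reaches the wrapped operator. A generic, self-contained sketch of that registry pattern (illustrative names, not ColossalAI's actual implementation):

    import torch

    _OP_TABLE = {}  # maps a torch op to its custom handler

    def register_op(func):
        def decorator(impl):
            _OP_TABLE[func] = impl
            return impl
        return decorator

    class MyTensor(torch.Tensor):
        @classmethod
        def __torch_function__(cls, func, types, args=(), kwargs=None):
            kwargs = kwargs or {}
            if func in _OP_TABLE:
                return _OP_TABLE[func](types, args, kwargs)  # dispatch to the registered handler
            return super().__torch_function__(func, types, args, kwargs)

    @register_op(torch.nn.functional.linear)
    def my_linear(types, args, kwargs):
        x, weight = args[0], args[1]
        return x @ weight.t()  # simplified handler: ignores the bias argument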
@@ -41,7 +41,7 @@ class ParamOpHookManager:
     def use_hooks(*hooks: ParamOpHook):
         """Change the param op hooks you use. Nested calling is allowed.

-        Example::
+        Example:
            >>> with ParamOpHookManager.use_hooks(*hooks):
            >>>     do_something()
            >>> with ParamOpHookManager.use_hooks():
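
Note: "nested calling is allowed" means every ``with`` block installs its own hook set and the enclosing set is restored on exit, so ``use_hooks()`` with no arguments temporarily disables all hooks. A generic sketch of such a re-entrant hook stack (illustrative, not ColossalAI's implementation):

    import contextlib

    class HookManager:
        hooks = ()  # hooks currently in effect

        @classmethod
        @contextlib.contextmanager
        def use_hooks(cls, *hooks):
            enclosing = cls.hooks      # remember the hooks of the outer block
            cls.hooks = hooks
            try:
                yield
            finally:
                cls.hooks = enclosing  # restore them when this block exits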
@@ -26,7 +26,7 @@ def named_params_with_colotensor(
         (string, Union[Tensor, ColoTensor]): Tuple containing
             the name and parameter (or ColoTensor parameter)

-    Example::
+    Example:

         >>> model = torch.nn.Linear(*linear_size)
         >>> delattr(model.weight)
@@ -16,6 +16,30 @@ class OptimState(Enum):


 class ZeroOptimizer(ColossalaiOptimizer):
+    """A wrapper for optimizer. ``ZeroDDP`` and ``ZeroOptimizer`` implement Zero Redundancy Optimizer (ZeRO state-3).
+
+    Note:
+        You must use ``ZeroDDP`` with ``ZeroOptimizer``.
+
+    Note:
+        Make sure you set ``placement_policy`` of ``GeminiManager`` to `"auto"`,
+        if you set ``gpu_margin_mem_ratio > 0``.
+
+    Args:
+        optim (Optimizer): An Optimizer instance.
+        module (ZeroDDP): A ``ZeroDDP`` instance.
+        gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
+            which will be used when using hybrid CPU optimizer.
+            This argument is meaningless when `placement_policy` of `GeminiManager` is not "auto".
+            Defaults to 0.0.
+        initial_scale (float, optional): Initial scale used by DynamicGradScaler. Defaults to 2**32.
+        min_scale (float, optional): Min scale used by DynamicGradScaler. Defaults to 1.
+        growth_factor (float, optional): growth_factor used by DynamicGradScaler. Defaults to 2.
+        backoff_factor (float, optional): backoff_factor used by DynamicGradScaler. Defaults to 0.5.
+        growth_interval (float, optional): growth_interval used by DynamicGradScaler. Defaults to 1000.
+        hysteresis (float, optional): hysteresis used by DynamicGradScaler. Defaults to 2.
+        max_scale (int, optional): max_scale used by DynamicGradScaler. Defaults to 2**32.
+    """

     def __init__(self,
                  optim: Optimizer,
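
Note: the DynamicGradScaler arguments documented above follow the usual dynamic loss-scaling scheme: back off by ``backoff_factor`` after repeated overflow steps (delayed by ``hysteresis``), grow by ``growth_factor`` after ``growth_interval`` consecutive clean steps, and clamp the result to ``[min_scale, max_scale]``. A sketch of that update rule (illustrative only, not the actual DynamicGradScaler code):

    def update_scale(scale, found_overflow, state,
                     growth_factor=2.0, backoff_factor=0.5, growth_interval=1000,
                     hysteresis=2, min_scale=1.0, max_scale=2**32):
        # state tracks consecutive clean steps and the remaining hysteresis budget
        if found_overflow:
            state['good_steps'] = 0
            state['hysteresis'] -= 1
            if state['hysteresis'] <= 0:
                scale = max(scale * backoff_factor, min_scale)  # shrink after repeated overflows
                state['hysteresis'] = hysteresis
        else:
            state['good_steps'] += 1
            if state['good_steps'] >= growth_interval:
                scale = min(scale * growth_factor, max_scale)   # grow after a clean streak
                state['good_steps'] = 0
        return scale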