[doc] update rst and docstring (#1351)

* update rst

* add zero docstr

* fix docstr

* remove fx.tracer.meta_patch

* fix docstr

* fix docstr

* update fx rst

* fix fx docstr

* remove useless rst
Author: ver217
Date: 2022-07-21 15:54:53 +08:00 (committed by GitHub)
Parent: 274c1a3b5f
Commit: d068af81a3
132 changed files with 724 additions and 146 deletions


@@ -60,7 +60,7 @@ class GradScaler(object):
* ``scaler.step(optimizer)`` safely unscales gradients and calls ``optimizer.step()``.
* ``scaler.update()`` updates ``scaler``'s scale factor.
Example::
Example:
# Creates a GradScaler once at the beginning of training.
scaler = GradScaler()
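For reference, a minimal training-step sketch of the loop this docstring describes. It uses torch.cuda.amp.GradScaler as a stand-in, on the assumption that the GradScaler in this file mirrors its scale()/step()/update() interface; the model, data and loss are placeholders.

import torch

# Create a GradScaler once at the beginning of training (stand-in for the class in this file).
model = torch.nn.Linear(16, 4).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
scaler = torch.cuda.amp.GradScaler()

for _ in range(3):
    inputs = torch.randn(8, 16, device='cuda')
    targets = torch.randn(8, 4, device='cuda')
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        loss = torch.nn.functional.mse_loss(model(inputs), targets)
    scaler.scale(loss).backward()  # backward pass on the scaled loss
    scaler.step(optimizer)         # safely unscales gradients and calls optimizer.step()
    scaler.update()                # updates the scale factor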


@@ -102,10 +102,10 @@ def sync_states():
def seed(parallel_mode: ParallelMode):
""" A context for seed switch
Examples::
Examples:
with seed(ParallelMode.DATA):
output = F.dropout(input)
>>> with seed(ParallelMode.DATA):
>>> output = F.dropout(input)
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
@@ -124,18 +124,18 @@ def with_seed(func, parallel_mode: ParallelMode):
"""
A function wrapper which executes the function with a specified seed.
Examples::
Examples:
# use with decorator
@with_seed(ParallelMode.DATA)
def forward(input):
return F.dropout(input)
out = forward(input)
# OR use it inline
def forward(input):
return F.dropout(input)
wrapper_forward = with_seed(forward, ParallelMode.DATA)
out = wrapper_forward(input)
>>> # use with decorator
>>> @with_seed(ParallelMode.DATA)
>>> def forward(input):
>>> return F.dropout(input)
>>> out = forward(input)
>>> # OR use it inline
>>> def forward(input):
>>> return F.dropout(input)
>>> wrapper_forward = with_seed(forward, ParallelMode.DATA)
>>> out = wrapper_forward(input)
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` could be found
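Read together, the two hunks in this file show the same dropout example as a context manager and as a wrapper. A combined sketch, assuming seed and with_seed are importable from colossalai.context.random (the module this helper belongs to) and that the distributed context has already been launched so the ParallelMode.DATA seed is registered:

import torch
import torch.nn.functional as F
from colossalai.context import ParallelMode
from colossalai.context.random import seed, with_seed  # assumed import path

x = torch.rand(4, 8, device='cuda')

# Context-manager form: dropout draws randomness from the DATA-parallel RNG state.
with seed(ParallelMode.DATA):
    out_ctx = F.dropout(x, p=0.5)

# Wrapper form: each call to the wrapped function runs under the DATA-parallel seed.
def forward(inp):
    return F.dropout(inp, p=0.5)

wrapper_forward = with_seed(forward, ParallelMode.DATA)
out_fn = wrapper_forward(x)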


@@ -12,7 +12,8 @@ class ColoProxy(Proxy):
ColoProxy is a proxy class which uses meta tensors to handle data-dependent control flow. The original torch.fx proxy
cannot be used to infer the condition statement; with this proxy, torch.fx can still run even with if statements.
Usage:
Example::
proxy = tracer.create_proxy(...)
proxy.meta_data = torch.empty(4, 2, device='meta')
print(len(proxy)) # expect output 4


@@ -35,7 +35,8 @@ class ColoTracer(Tracer):
ColoTracer is a symbolic tracer designed to support dynamic control flow by using meta tensors for the `colossalai.fx` module.
This tracer is initialized in the same way as the original torch.fx.Tracer.
Usage:
Usage::
class Model(nn.Module):
def __init__(self):
super().__init__()
@@ -254,8 +255,8 @@ class ColoTracer(Tracer):
non_meta_arg_names = sig_names - meta_arg_names
for k, v in sig.parameters.items():
if k in non_meta_arg_names and \
k not in concrete_args and \
v.default is not inspect.Parameter.empty:
k not in concrete_args and \
v.default is not inspect.Parameter.empty:
concrete_args[k] = v.default
# get non concrete arg names
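For context on how the two colossalai.fx classes above are used together, here is a tracing sketch over a model with a shape-dependent branch, the case ColoProxy's meta data exists to handle. It assumes ColoTracer is exported from colossalai.fx and that trace() accepts a meta_args mapping of meta tensors (the meta_arg_names handling above points to such an argument); treat both as assumptions.

import torch
import torch.nn as nn
from colossalai.fx import ColoTracer  # assumed export location

class Model(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(4, 4)

    def forward(self, x):
        # Shape-dependent control flow: a stock torch.fx tracer cannot evaluate this
        # condition, while ColoProxy carries meta-tensor shape information so it can.
        if x.size(0) > 1:
            x = self.linear(x)
        return x

model = Model()
tracer = ColoTracer()
# meta_args (assumed keyword) marks inputs that should be traced as meta tensors.
graph = tracer.trace(model, meta_args={'x': torch.rand(2, 4, device='meta')})
gm = torch.fx.GraphModule(model, graph)
print(gm.code)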


@@ -13,6 +13,14 @@ class GeminiManager:
PatrickStar: Parallel Training of Pre-trained Models via Chunk-based Memory Management
https://arxiv.org/abs/2108.05818
Args:
placement_policy (str): Which device to place *held* tensors. It can be 'cpu', 'cuda' and 'auto'.
If it's 'cpu', parameters, gradients and optimizer states will be offloaded to CPU, which means min CUDA memory will be used.
If it's 'cuda', they won't be offloaded, which means max CUDA memory will be used.
If it's 'auto', they are moved dynamically based on CPU and CUDA memory usage, so that heterogeneous memory space is used evenly.
Note that 'auto' policy can only work well when no other processes use CUDA during your training.
chunk_manager (ChunkManager): A ``ChunkManager`` instance.
"""
def __init__(self, placement_policy: str, chunk_manager: ChunkManager) -> None:
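A small wiring sketch based only on the signature above; how the ChunkManager itself is configured is outside this hunk, and the import path is an assumption.

from colossalai.gemini import ChunkManager, GeminiManager  # assumed import path

def build_gemini_manager(chunk_manager: ChunkManager, policy: str = 'auto') -> GeminiManager:
    # 'cpu'  : offload parameters, gradients and optimizer states to CPU (min CUDA memory)
    # 'cuda' : keep them on CUDA (max CUDA memory)
    # 'auto' : move them dynamically based on CPU and CUDA memory usage
    assert policy in ('cpu', 'cuda', 'auto')
    return GeminiManager(policy, chunk_manager)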


@@ -40,7 +40,7 @@ def _cast_float(args, dtype: torch.dtype):
class ColoDDP(torch.nn.Module):
"""Distributed data parallel for ColoTensor. Nested ColoDDP is not supported now.
Example::
Example:
>>> from colossalai.core import global_context as gpc
>>> from colossalai.context import ParallelMode
>>> model = torch.nn.Linear(20, 1)
@@ -148,7 +148,7 @@ class ColoDDP(torch.nn.Module):
"""Sets parameters to be ignored by DDP.
This method must be called before initializing ColoDDP.
Example::
Example:
>>> params_to_ignore = []
>>> for p in module.parameters():
>>> if should_ignore(p):
@@ -174,7 +174,7 @@ class ZeroDDP(ColoDDP):
We can configure chunk and gemini via ChunkManager and GeminiManager respectively.
For more details, see the API reference of ``ChunkManager`` and ``GeminiManager``.
Example::
Example:
>>> model = torch.nn.Linear(20, 1)
>>> placement_policy = 'cuda'
>>> chunk_size = ChunkManager.search_chunk_size(model, search_range, n_grids) if use_chunk else None
@@ -283,7 +283,7 @@ class ZeroDDP(ColoDDP):
dict:
a dictionary containing a whole state of the module
Example::
Example:
>>> module.state_dict().keys()
['bias', 'weight']
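Stitching the ZeroDDP docstring example into one sketch. The diff is cut off before the ChunkManager and ZeroDDP construction lines, so the ChunkManager(chunk_size) call, the concrete search values and the import paths are assumptions rather than quotes from the file.

import torch
from colossalai.gemini import ChunkManager, GeminiManager  # assumed import paths
from colossalai.nn.parallel import ZeroDDP                 # assumed import path

model = torch.nn.Linear(20, 1)
placement_policy = 'cuda'
chunk_size = ChunkManager.search_chunk_size(model, 64 * 1024**2, 32)  # (model, search_range, n_grids)
chunk_manager = ChunkManager(chunk_size)                               # assumed minimal constructor
gemini_manager = GeminiManager(placement_policy, chunk_manager)
model = ZeroDDP(model, gemini_manager)                                 # assumed constructor, per the example above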


@@ -21,7 +21,7 @@ def colo_op_impl(func):
by ``__torch_function__`` dispatch and has a ColoTensor as any of its
parameters, the function provided will be invoked for that operator.
Example::
Example:
>>> @colo_op_impl(torch.nn.functional.linear)
>>> def my_custom_linear(types, args, kwargs, process_group):
>>> ....
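The decorator pattern from the example above, filled out into a complete (if trivial) handler. The (types, args, kwargs, process_group) signature is taken from the docstring; the import path and the handler body are illustrative assumptions.

import torch.nn.functional as F
from colossalai.tensor.op_wrapper import colo_op_impl  # assumed import path

@colo_op_impl(F.linear)
def my_custom_linear(types, args, kwargs, process_group):
    # A real handler would add tensor-parallel logic based on the ColoTensor
    # arguments; this one simply falls back to the stock implementation.
    kwargs = kwargs or {}
    return F.linear(*args, **kwargs)

After registration, a call to F.linear whose arguments include a ColoTensor is routed to this handler through __torch_function__ dispatch, as described above.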


@@ -41,7 +41,7 @@ class ParamOpHookManager:
def use_hooks(*hooks: ParamOpHook):
"""Change the param op hooks you use. Nested calling is allowed.
Example::
Example:
>>> with ParamOpHookManager.use_hooks(*hooks):
>>> do_something()
>>> with ParamOpHookManager.use_hooks():
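The nested-context pattern from the example above, written out. The hooks are assumed to be ParamOpHook instances constructed elsewhere, and the import path is an assumption.

from colossalai.tensor.param_op_hook import ParamOpHookManager  # assumed import path

def run_with_hooks(model, data, hooks):
    with ParamOpHookManager.use_hooks(*hooks):
        out = model(data)        # parameter ops in this block trigger the given hooks
        with ParamOpHookManager.use_hooks():
            probe = model(data)  # inner block installs an empty hook list, so none fire
    return out, probe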


@@ -26,7 +26,7 @@ def named_params_with_colotensor(
(string, Union[Tensor, ColoTensor]): Tuple containing
the name and parameter (or ColoTensor parameter)
Example::
Example:
>>> model = torch.nn.Linear(*linear_size)
>>> delattr(model.weight)
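A short iteration sketch for the generator documented above; the import path is an assumption, and the call passes only the module.

import torch
from colossalai.utils import named_params_with_colotensor  # assumed import path

model = torch.nn.Linear(20, 4)
for name, param in named_params_with_colotensor(model):
    # Each item is a (name, parameter) pair; the parameter may be a plain
    # Tensor or a ColoTensor, as the return description above states.
    print(name, tuple(param.shape))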


@@ -16,6 +16,30 @@ class OptimState(Enum):
class ZeroOptimizer(ColossalaiOptimizer):
"""A wrapper for optimizer. ``ZeroDDP`` and ``ZeroOptimizer`` implement Zero Redundancy Optimizer (ZeRO state-3).
Note:
You must use ``ZeroDDP`` with ``ZeroOptimizer``.
Note:
Make sure you set ``placement_policy`` of ``GeminiManager`` to `"auto"`,
if you set ``gpu_margin_mem_ratio > 0``.
Args:
optim (Optimizer): An Optimizer instance.
module (ZeroDDP): A ``ZeroDDP`` instance.
gpu_margin_mem_ratio (float, optional): The ratio of GPU remaining memory (after the first forward-backward)
which will be used when using hybrid CPU optimizer.
This argument is meaningless when `placement_policy` of `GeminiManager` is not "auto".
Defaults to 0.0.
initial_scale (float, optional): Initial scale used by DynamicGradScaler. Defaults to 2**32.
min_scale (float, optional): Min scale used by DynamicGradScaler. Defaults to 1.
growth_factor (float, optional): growth_factor used by DynamicGradScaler. Defaults to 2.
backoff_factor (float, optional): backoff_factor used by DynamicGradScaler. Defaults to 0.5.
growth_interval (float, optional): growth_interval used by DynamicGradScaler. Defaults to 1000.
hysteresis (float, optional): hysteresis used by DynamicGradScaler. Defaults to 2.
max_scale (int, optional): max_scale used by DynamicGradScaler. Defaults to 2**32.
"""
def __init__(self,
optim: Optimizer,
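To show how the arguments above fit together, a construction sketch; the import path is an assumption, and the module passed in must already be a ZeroDDP instance, as the first Note requires.

import torch
from colossalai.zero import ZeroOptimizer  # assumed import path

def build_zero_optimizer(zero_ddp_model):
    # Per the Notes above: the module must be wrapped in ZeroDDP, and
    # gpu_margin_mem_ratio > 0 only takes effect when the GeminiManager
    # placement policy is 'auto'.
    optim = torch.optim.Adam(zero_ddp_model.parameters(), lr=1e-3)
    return ZeroOptimizer(optim,
                         zero_ddp_model,
                         gpu_margin_mem_ratio=0.3,  # share of spare GPU memory used by the hybrid CPU optimizer
                         initial_scale=2**32,       # DynamicGradScaler settings from the Args list
                         growth_interval=1000)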