Refactored docstrings to Google style

This commit is contained in:
Liang Bowen
2022-03-25 13:02:39 +08:00
committed by アマデウス
parent 53b1b6e340
commit ec5086c49c
94 changed files with 3389 additions and 2982 deletions

View File

@@ -114,10 +114,13 @@ class CheckpointFunction(torch.autograd.Function):
def checkpoint(function, activation_offload ,*args):
"""Checkpoint the computation while preserve the rng states, modified from Pytorch torch.utils.checkpoint
"""Checkpoint the computation while preserve the rng states, modified from Pytorch torch.utils.checkpoint.
:param function: Describe the forward pass function. It should know how to handle the input tuples.
:param args: Tuple containing the parameters of the function
:return: Output of running function with provided args
Args:
function: The forward pass function. It should know how to handle the input tuples.
args (tuple): Tuple containing the parameters of the function.
Returns:
Output of running function with provided args.
"""
return CheckpointFunction.apply(function, activation_offload, *args)
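
For illustration, a minimal usage sketch of this checkpoint API (not part of the original diff); the import path, the toy tensors, and an initialized ColossalAI context with its seed manager are assumptions:

import torch
import torch.nn.functional as F
from colossalai.utils.activation_checkpoint import checkpoint  # import path assumed

def forward_part(x, weight):
    # a forward computation whose intermediate activations we do not want to keep
    return F.relu(F.linear(x, weight))

x = torch.randn(4, 16, requires_grad=True)
w = torch.randn(16, 16, requires_grad=True)
# positional arguments follow the signature above: function, activation_offload, *args
out = checkpoint(forward_part, False, x, w)
out.sum().backward()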

View File

@@ -50,17 +50,17 @@ def _get_standard_checkpoint_filename(epoch: int, suffix: str = ''):
def get_checkpoint_path(checkpoint_dir: str, epoch: int, suffix: str = ''):
"""This is a function to generate the checkpoint path from the (checkpoint_dir, epoch, suffix, gpu_parallel_rank) tuple.
"""This is a function to generate the checkpoint path from the tuple
(checkpoint_dir, epoch, suffix, gpu_parallel_rank).
This is useful when generating and recovering the checkpoint.
:param checkpoint_dir: Set up a directory for saving checkpoints
:type checkpoint_dir: str
:param epoch: Epoch number (indicate how many epochs have you trained this model)
:type epoch: int
:param suffix: Additional notation to specify the model or checkpoint, defaults to ''
:type suffix: str, optional
:return: Checkpoint path to be generated
:rtype: path
Args:
checkpoint_dir (str): Directory for saving checkpoints.
epoch (int): Epoch number (indicates how many epochs the model has been trained for).
suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''.
Returns:
str: The checkpoint path to be generated.
"""
ckpt_filename = _get_standard_checkpoint_filename(epoch, suffix)
return os.path.join(checkpoint_dir, ckpt_filename)
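
A hedged usage sketch (the directory and suffix are hypothetical, the import path is assumed, and the parallel context is assumed to be initialized):

from colossalai.utils.checkpointing import get_checkpoint_path  # import path assumed

ckpt = get_checkpoint_path('./checkpoints', epoch=10, suffix='_vit')
# yields something like './checkpoints/epoch10-<ranks_name>_vit.pt', where <ranks_name>
# comes from _get_standard_checkpoint_filename above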
@@ -74,12 +74,13 @@ def _ensure_directory_exists(filename: str):
def get_latest_checkpoint_pattern(suffix: str = ''):
"""Generate Regular expression of latest checkpoint's pattern
"""Generate Regular expression of the latest checkpoint's pattern.
:param suffix: Additional notation to specify the model or checkpoint, defaults to ''
:type suffix: str, optional
:return: Checkpoint pattern
:rtype: regular expression
Args:
suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''.
Returns:
str: The regular expression of checkpoint pattern.
"""
ranks_name = _get_ranks_name()
pattern = r'epoch(\d+)-{}{}\.pt'.format(ranks_name, suffix)
@@ -88,16 +89,19 @@ def get_latest_checkpoint_pattern(suffix: str = ''):
def get_latest_checkpoint_path(checkpoint_dir: str, suffix: str = ''):
"""This is a function to retrieve the latest checkpoint path from the (checkpoint_dir, suffix, gpu_parallel_rank) tuple.
"""This is a function to retrieve the latest checkpoint path from the tuple
(checkpoint_dir, suffix, gpu_parallel_rank).
This is useful when recovering the checkpoint, especially when you do not know the epoch number.
:param checkpoint_dir: Directory for saving checkpoints
:type checkpoint_dir: str
:param suffix: Additional notation to specify the model or checkpoint, defaults to ''
:type suffix: str, optional
:raises FileNotFoundError: Raise error when we cannot find the latest checkpoint file with inputs given
:return: The latest checkpoint path to be retrieved
:rtype: path
Args:
checkpoint_dir (str): Directory for saving checkpoints.
suffix (str, optional): Additional notation to specify the model or checkpoint, defaults to ''.
Returns:
str: The latest retrieved checkpoint path.
Raises:
FileNotFoundError: Raised when the latest checkpoint file cannot be found with the given inputs.
"""
CKPT_NAME_PAT = get_latest_checkpoint_pattern(suffix=suffix)
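
A hedged sketch of resuming when the epoch number is unknown (paths are hypothetical; import path and an initialized parallel context are assumed):

from colossalai.utils.checkpointing import get_latest_checkpoint_path  # import path assumed

try:
    latest_ckpt = get_latest_checkpoint_path('./checkpoints', suffix='_vit')
except FileNotFoundError:
    latest_ckpt = None  # no checkpoint found for these ranks, start from scratch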
@@ -126,22 +130,19 @@ def save_checkpoint(checkpoint_path: str,
optimizer: torch.optim.Optimizer,
lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None,
**kwargs):
"""Given a directory to store the checkpoints, saves all the training components' parameters or buffers, such as model,
optimizer, lr_scheduler and etc. into a checkpoint dictionary.
"""Given a directory to store the checkpoints, saves all the training components' parameters or buffers, such as
model, optimizer, lr_scheduler, etc., into a checkpoint dictionary.
This method can be used for both colosalai nn.BaseModel and normal pytorch nn.Module.
This method can be used for both :class:`colossalai.nn.BaseModel` and normal :class:`torch.nn.Module`.
:param checkpoint_path: Set up a directory for saving checkpoints
:type checkpoint_path: str
:param epoch: Epoch number (indicate how many epochs have you trained this model)
:type epoch: int
:param model: Model to be registered
:type model: torch.nn.Module
:param optimizer: Optimizer to be registered
:type optimizer: torch.optim.Optimizer
:param lr_scheduler: lr_scheduler to be registered, defaults to None
:type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
Args:
checkpoint_path (str): The path to which the checkpoint will be saved.
epoch (int): Epoch number (indicates how many epochs the model has been trained for).
model (:class:`torch.nn.Module`): Model to be registered.
optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to be registered.
lr_scheduler (Union[:class:`torch.optim.lr_scheduler`,
:class:`colossalai.nn.lr_scheduler`], optional): lr_scheduler to be registered, defaults to None.
kwargs (dict): Additional parameters to be saved.
"""
# for compatibility with normal pytorch nn.Module
if hasattr(model, 'state_dict_for_save_checkpoint'):
@@ -165,31 +166,31 @@ def load_checkpoint(checkpoint_path: str,
finetune: bool = False,
strict: bool = True) -> Tuple:
"""Loads the checkpoint file.
If finetune is False, then we intend to continue/resume the training process from the checkpoint given.
So we copy parameters and buffers from state_dict into these modules (model, optimizer, lr_scheduler)
and its descendants.
If finetune is True, then only the weights and buffers of model should be reload.
If strict is True, then the keys of state_dict must exactly match the keys returned by this modules
state_dict() function.
and its descendants.
:param checkpoint_path: The exact and matched checkpoint_path directory to retrieve appropriate state_dict
:type checkpoint_path: str
:param model: Model to reload parameters and buffers
:type model: torch.nn.Module
:param optimizer: Optimizer to recuperate
:type optimizer: torch.optim.Optimizer
:param lr_scheduler: lr_scheduler to recuperate, defaults to None
:type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
:param finetune: Whether to finetune the model with new dataset or continue the pre-training, defaults to False
:type finetune: bool, optional
:param strict: Whether to strictly enforce that the keys in
:attr:`state_dict` of the checkpoint match the names of
parameters and buffers in model., defaults to True
:type strict: bool, optional
:raises ValueError: Raise error if the model/optimizer cannot successfully be recuperated
:return: (the epoch number of the checkpoint retrieved, the checkpoint retrieved)
:rtype: Tuple
If finetune is True, then only the weights and buffers of model should be reloaded.
If strict is True, then the keys of state_dict must exactly match the keys returned
by this module's state_dict() function.
Args:
checkpoint_path (str): The exact and matched checkpoint_path directory to retrieve appropriate state_dict.
model (:class:`torch.nn.Module`): Model to reload parameters and buffers.
optimizer (Union[:class:`torch.optim.Optimizer`, :class:`colossalai.nn.optimizer`]): Optimizer to recuperate.
lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`, optional):
lr_scheduler to recuperate, defaults to None.
finetune (bool, optional): Whether to finetune the model with new dataset or
continue the pre-training, defaults to False.
strict (bool, optional): Whether to strictly enforce that the keys in :attr:`state_dict`
of the checkpoint match the names of parameters and buffers in model, defaults to True.
Returns:
Tuple(int, ``checkpoint``): The tuple (the epoch number of the checkpoint retrieved, the checkpoint retrieved).
Raises:
ValueError: Raised if the model/optimizer cannot be successfully recovered.
"""
# Load the checkpoint.
checkpoint = torch.load(checkpoint_path, map_location='cpu')
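
Putting save_checkpoint and load_checkpoint together, a hedged end-of-epoch sketch; the import path and an initialized ColossalAI parallel context are assumptions, and the tiny model/optimizer are placeholders:

import torch
from colossalai.utils.checkpointing import get_checkpoint_path, save_checkpoint, load_checkpoint

model = torch.nn.Linear(8, 8)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)

ckpt_path = get_checkpoint_path('./checkpoints', epoch=3)
save_checkpoint(ckpt_path, 3, model, optimizer)

# later, to resume training from the same file:
start_epoch, raw_ckpt = load_checkpoint(ckpt_path, model, optimizer, finetune=False, strict=True)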

View File

@@ -27,10 +27,10 @@ from .multi_tensor_apply import multi_tensor_applier
def print_rank_0(msg: str, logger=None):
"""Print messages and save logs(optional). This is executed only if you are the rank-0 gpu.
:param msg: A string message to output
:type msg: str
:param logger: Python logger object, defaults to None
:type logger: optional
Args:
msg (str): A string message to output.
logger (:class:`colossalai.logging.DistributedLogger`, optional):
The logger to record the message, defaults to None.
"""
if gpc.get_global_rank() == 0:
if logger is None:
@@ -53,12 +53,15 @@ def free_port():
def sync_model_param(model, parallel_mode):
"""Make sure data parameters are consistent during Data Parallel Mode
r"""Make sure data parameters are consistent during Data Parallel Mode.
:param model: A pyTorch nn.model on whose parameters you check the consistency
:param parallel_mode: Parallel mode to be checked
:type model: torch.nn.Module
:type parallel_mode: colossalai.context.ParallelMode
Args:
model (:class:`torch.nn.Module`): A PyTorch model on whose parameters you check the consistency.
parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel mode to be checked.
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1:
for param in model.parameters():
@@ -146,18 +149,19 @@ def clip_grad_norm_fp32(parameters, max_norm, norm_type=2):
"""Clips gradient norm of an iterable of parameters whose gradients are in fp32.
This is adapted from :func:`torch.nn.utils.clip_grad.clip_grad_norm_` and
added functionality to handle model parallel parameters. Note that
the gradients are modified in place.
added functionality to handle model parallel parameters.
:param parameters: An iterable of Tensors or a single Tensor that will have gradients normalized
:type parameters: (Iterable[Tensor] or Tensor)
:param max_norm: Max norm of the gradients
:type max_norm: float or int
:param norm_type: Type of the used p-norm. Can be ``'inf'`` for infinity norm.
:type norm_type: float or int
Note:
The gradients are modified in place.
:return: Total norm of the parameters (viewed as a single vector).
:rtype: float
Args:
parameters (Iterable[:class:`torch.tensor`] or :class:`torch.tensor`):
An iterable of Tensors or a single Tensor that will have gradients normalized.
max_norm (Union[float, int]): Max norm of the gradients.
norm_type (Union[float, int, 'inf']): Type of the used p-norm. Can be ``'inf'`` for infinity norm.
Returns:
float: Total norm of the parameters.
"""
if isinstance(parameters, torch.Tensor):
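
A hedged sketch of how this clipping helper slots into a training step, mirroring torch.nn.utils.clip_grad_norm_; the import path and an initialized ColossalAI parallel context are assumptions:

import torch
from colossalai.utils import clip_grad_norm_fp32  # import path assumed

model = torch.nn.Linear(16, 4)
loss = model(torch.randn(8, 16)).sum()
loss.backward()
# clip the fp32 gradients in place before the optimizer step; the total norm is returned
total_norm = clip_grad_norm_fp32(model.parameters(), max_norm=1.0, norm_type=2)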

View File

@@ -19,18 +19,15 @@ T_co = TypeVar('T_co', covariant=True)
@DATA_SAMPLERS.register_module
class DataParallelSampler(Sampler):
"""A data sampler for distributed data parallelism
"""A data sampler for distributed data parallelism.
:param dataset: A Dataset instance
:type dataset: torch.utils.data.Dataset
:param shuffle: Whether to shuffle data, defaults to False
:type shuffle: bool, optional
:param seed: The random seed, defaults to 0
:type seed: int, optional
:param drop_last: Set to True to drop the last incomplete batch, if the dataset size is not divisible by the batch
size. If False and the size of dataset is not divisible by the batch size, then the last batch will be smaller,
defaults to False
:type drop_last: bool, optional
Args:
dataset (:class:`torch.utils.data.Dataset`): The Dataset for sampling.
shuffle (bool, optional): Whether to shuffle data, defaults to False.
seed (int, optional): The random seed used for sampling, defaults to 0.
drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
is not divisible by the batch size. If False and the size of dataset is not divisible by
the batch size, then the last batch will be smaller, defaults to False.
"""
def __init__(self,
@@ -104,8 +101,8 @@ class DataParallelSampler(Sampler):
use a different random ordering for each epoch. Otherwise, the next iteration of this
sampler will yield the same ordering.
:param epoch: Epoch number.
:type epoch: int
Args:
epoch (int): Epoch number.
"""
self.epoch = epoch
@@ -118,29 +115,27 @@ def get_dataloader(dataset,
pin_memory=False,
num_workers=0,
**kwargs):
"""Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not)
r"""Set up a deterministic dataloader (also configure seed workers, samplers and whether shuffle or not)
.. note:: When pipeline parallel is enabled, shuffle cannot be True as it will result in mismatch between input data
on the 1st stage and label on the last stage
Note:
When pipeline parallel is enabled, shuffle cannot be True as it will result in a mismatch between input data
on the 1st stage and labels on the last stage.
:param dataset: A :class:`torch.utils.data.Dataset` object
:param shuffle: Whether to shuffle the dataset
:param seed: Random worker seed, defaults to 1024
:param add_sampler: Add DistributedDataParallelSampelr to the dataset
:param drop_last: Drop the last incomplete batch of data
:param pin_memory: Whether to pin memory address in CPU memory
:param num_workers: Number of worker threads for this dataloader
Args:
dataset (:class:`torch.utils.data.Dataset`): The dataset to be loaded.
shuffle (bool, optional): Whether to shuffle the dataset. Defaults to False.
seed (int, optional): Random worker seed for sampling, defaults to 1024.
add_sampler (bool, optional): Whether to add ``DataParallelSampler`` to the dataset. Defaults to True.
drop_last (bool, optional): Set to True to drop the last incomplete batch, if the dataset size
is not divisible by the batch size. If False and the size of dataset is not divisible by
the batch size, then the last batch will be smaller, defaults to False.
pin_memory (bool, optional): Whether to pin memory address in CPU memory. Defaults to False.
num_workers (int, optional): Number of worker threads for this dataloader. Defaults to 0.
kwargs (dict): Optional parameters for ``torch.utils.data.DataLoader``, more details can be found in
`DataLoader <https://pytorch.org/docs/stable/_modules/torch/utils/data/dataloader.html#DataLoader>`_.
:type dataset: :class:`torch.utils.data.Dataset`
:type shuffle: bool, optional. Default is False
:type seed: int, optional. Default is 1024
:type add_sampler: bool, optional. Default is True
:type drop_last: bool, optional. Default is False
:type pin_memory: bool, optional. Default is False
:type num_workers: int, optional. Default is 0
:return: A object of :class:`torch.utils.data.DataLoader`
:rtype: :class:`torch.utils.data.DataLoader`
Returns:
:class:`torch.utils.data.DataLoader`: A DataLoader used for training or testing.
"""
_kwargs = kwargs.copy()
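
A hedged sketch of building a training dataloader with these options; the FakeData dataset is a stand-in, the extra batch_size keyword is simply forwarded to torch.utils.data.DataLoader, and an initialized ColossalAI context is assumed:

from torchvision import datasets, transforms
from colossalai.utils import get_dataloader  # import path assumed

train_set = datasets.FakeData(size=1000, transform=transforms.ToTensor())
train_loader = get_dataloader(train_set,
                              shuffle=True,     # keep False when pipeline parallelism is enabled
                              seed=1024,
                              add_sampler=True,
                              drop_last=True,
                              pin_memory=True,
                              num_workers=4,
                              batch_size=32)    # forwarded via **kwargs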

View File

@@ -13,19 +13,25 @@ def accumulate_gradient(model: nn.Module,
accumulate_size: int,
gradient_handlers: List[BaseGradientHandler] = None,
lr_scheduler: _LRScheduler = None):
"""
:param model: your model object
:type model: :class:`torch.nn.Module`
:param optimizer: your optimizer object
:type optimizer: :class:`torch.optim.Optimizer`
:param dataloader: your dataloader object
:type dataloader: Iterable
:param accumulate_size: the number of steps to accumulate gradients
:type accumulate_size: int
:param gradient_handlers: list of gradient handler objects. Default is None
:type gradient_handlers: List[:class:`colossalai.engine.BaseGradientHandler`]
:param lr_scheduler: your lr scheduler object. Default is None
:type lr_scheduler: `torch.optim.lr_scheduler._LRScheduler`
r"""Turning model, optimizer, dataloader into corresponding object for gradient accumulation.
Args:
model (:class:`torch.nn.Module`): Your model object for gradient accumulation.
optimizer (:class:`torch.optim.Optimizer`): Your optimizer object for gradient accumulation.
dataloader (:class:`torch.utils.data.DataLoader` or iterable objects):
Your dataloader object, which will be consumed like iter(dataloader).
accumulate_size (int): The number of steps to accumulate gradients.
gradient_handlers (List[:class:`colossalai.engine.BaseGradientHandler`]):
list of gradient handler objects. Default is None.
lr_scheduler (`torch.optim.lr_scheduler` or `colossalai.nn.lr_scheduler`):
your ``lr_scheduler`` object for gradient accumulation. Defaults to None.
More details about `gradient_handlers` could be found in
`Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.
More details about `lr_scheduler` can be found in
`lr_scheduler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/nn/lr_scheduler>`_ and
`how to adjust learning rate <https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate>`_.
"""
optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size)
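
A hedged sketch of wiring 4-step gradient accumulation into a training setup; the toy model and dataloader are placeholders, the import path is assumed, and the returned tuple of wrapped objects is an assumption rather than something shown in this diff:

import torch
from torch.utils.data import DataLoader, TensorDataset
from colossalai.utils import accumulate_gradient  # import path assumed

model = torch.nn.Linear(16, 4)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
train_loader = DataLoader(TensorDataset(torch.randn(64, 16)), batch_size=8)

# assumed to return the wrapped objects (optimizer, dataloader, gradient handler, lr_scheduler)
optimizer, train_loader, _, _ = accumulate_gradient(model, optimizer, train_loader, accumulate_size=4)
# from here on, optimizer.step() only updates parameters every 4th call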

View File

@@ -15,15 +15,13 @@ from colossalai.engine import BaseGradientHandler
class GradAccumOptimizer(ColossalaiOptimizer):
"""A wrapper for the optimizer to enable gradient accumulation by skipping the steps
before accumulation size is reached
:param optim: Your optimizer object
:type optim: :class:`torch.optim.Optimizer`
:param accumulate_size: The number of steps to accumulate gradients
:type accumulate_size: int
:param model: Your model object to check if it is DDP for special handling of no_sync() context
:type model: :class:`torch.nn.Module`
before accumulation size is reached.
Args:
optim (:class:`torch.optim.Optimizer`): Your optimizer object for gradient accumulation.
accumulate_size (int): The number of steps to accumulate gradients.
model (:class:`torch.nn.Module`):
Your model object to check if it is DistributedDataParallel for special handling of no_sync() context.
"""
def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
@@ -76,18 +74,18 @@ class GradAccumOptimizer(ColossalaiOptimizer):
class GradAccumDataloader:
"""A wrapper for dataloder to enable gradient accumulation by dropping the last incomplete steps.
"""A wrapper for dataloader to enable gradient accumulation by dropping the last incomplete steps.
For example, if a dataloader has 10 batches of data and accumulate size is 4. The model paramters will
be update only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader,
(e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.
:param dataloader: Your dataloader object
:type dataloader: Iterable
:param accumulate_size: The number of steps to accumulate gradients
:type accumulate_size: int
Note:
The dataloader would drop the last incomplete steps for gradient accumulation.
For example, if a dataloader has 10 batches of data and the accumulate size is 4, the model parameters will
be updated only twice, at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
Thus, they will be automatically skipped by this class. If the dataloader is not a standard PyTorch dataloader
(e.g. a Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.
Args:
dataloader (``Iterable``): Your dataloader object for gradient accumulation.
accumulate_size (int): The number of steps to accumulate gradients.
"""
def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
@@ -125,13 +123,12 @@ class GradAccumDataloader:
class GradAccumLrSchedulerByStep(_LRScheduler):
"""A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps
before accumulation size is reached
:param lr_scheduler: Your lr scheduler object
:type lr_scheduler: :class:`torch.optim.lr_scheduler._LRScheduler`
:param accumulate_size: The number of steps to accumulate gradients
:type accumulate_size: int
before accumulation size is reached.
Args:
lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`):
Your ``lr_scheduler`` object for gradient accumulation.
accumulate_size (int): The number of steps to accumulate gradients.
"""
def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
@@ -171,13 +168,16 @@ class GradAccumLrSchedulerByStep(_LRScheduler):
class GradAccumGradientHandler:
"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
before accumulation size is reached
r"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
before accumulation size is reached.
:param grad_handler: Your gradient handler object
:type grad_handler: :class:`colossalai.engine.BaseGradientHandler`
:param accumulate_size: The number of steps to accumulate gradients
:type accumulate_size: int
Args:
grad_handler (:class:`colossalai.engine.BaseGradientHandler`):
Your ``gradient_handler`` object for gradient accumulation; it will be called when `accumulate_size` is reached.
accumulate_size (int): The number of steps to accumulate gradients.
More details about ``gradient_handlers`` could be found in
`Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.
"""

View File

@@ -14,12 +14,13 @@ from typing import Optional
def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int:
"""
Get the free memory info of device.
:param device: a torch device instance or None
:type device: Optional[torch.device]
:return: current memory usage, sized by Byte
:rtype: int
"""Get the free memory info of device.
Args:
device (Optional[``torch.device``]): a torch device instance or None. Defaults None.
Returns:
int: current memory usage, sized by Byte.
"""
if device:
assert device.type == 'cuda'
@@ -34,7 +35,7 @@ def colo_cuda_memory_used(device: Optional[torch.device] = None) -> int:
def bytes_to_GB(val, decimal=2):
"""A byte-to-Gigabyte converter, defaultly using binary notation.
"""A byte-to-Gigabyte converter, default using binary notation.
:param val: X bytes to convert
:return: X' GB
@@ -43,7 +44,7 @@ def bytes_to_GB(val, decimal=2):
def bytes_to_MB(val, decimal=2):
"""A byte-to-Megabyte converter, defaultly using binary notation.
"""A byte-to-Megabyte converter, default using binary notation.
:param val: X bytes to convert
:return: X' MB
@@ -54,13 +55,13 @@ def bytes_to_MB(val, decimal=2):
def report_memory_usage(message, logger=None, report_cpu=False):
"""Calculate and print RAM usage (in GB)
:param message: A prefix message to add in the log
:type message: str
:param logger: An instance of :class:`colossalai.logging.DistributedLogger`
:type logger: :class:`colossalai.logging.DistributedLogger`, optional
:param report_cpu: Whether to report CPU memory
:type report_cpu: bool, optional
:raises EnvironmentError: Raise error if no distributed environment has been initialized
Args:
message (str): A prefix message to add in the log.
logger (:class:`colossalai.logging.DistributedLogger`, optional): The logger used to record memory information.
report_cpu (bool, optional): Whether to report CPU memory.
Raises:
EnvironmentError: Raised if no distributed environment has been initialized.
"""
if not gpc.is_initialized(ParallelMode.GLOBAL):
raise EnvironmentError("No distributed environment is initialized")
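
A hedged usage sketch, assuming colossalai.launch (or an equivalent) has already initialized the distributed context and that the import path is correct:

from colossalai.logging import get_dist_logger
from colossalai.utils import report_memory_usage  # import path assumed

logger = get_dist_logger()
report_memory_usage('after forward pass', logger=logger, report_cpu=True)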

View File

@@ -12,8 +12,8 @@ def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]
size of every parameter. Since the parameters in data parallelism are replicated
on each GPU, we set their ep_size to 1.
:param model: A pyTorch nn.model from which we get dict
:type model: torch.nn.Module
Args:
model (:class:`torch.nn.Module`): A PyTorch `nn.Module` from which we get the dict.
"""
epsize_param_dict = dict()
for param in model.parameters():
@@ -29,10 +29,10 @@ def get_moe_epsize_param_dict(model: nn.Module) -> Dict[int, List[nn.Parameter]]
def sync_moe_model_param(model: nn.Module):
"""Make sure model parameters are consistent in MoE parallel context
"""Make sure model parameters are consistent in MoE parallel context.
:param model: A pyTorch nn.model on whose parameters you check the consistency
:type model: torch.nn.Module
Args:
model (:class:`torch.nn.Module`): A PyTorch model on whose parameters you check the consistency.
"""
if is_using_ddp():

View File

@@ -3,10 +3,10 @@
class MultiTensorApply(object):
"""
Apply an operation to a list of tensors efficiently
Apply an operation to a list of tensors efficiently.
:param chunk_size: Size of a chunk
:type chunk_size: int
Args:
chunk_size (int): Size of a chunk.
"""
available = False

View File

@@ -9,6 +9,7 @@ from collections import defaultdict
LINE_WIDTH = 108
LINE = '-' * LINE_WIDTH + '\n'
class TensorDetector():
def __init__(self,
show_info: bool = True,
@@ -16,17 +17,14 @@ class TensorDetector():
include_cpu: bool = False,
module: Optional[nn.Module] = None
):
"""This class is an detector to detect tensor on different devices.
:param show_info: whether to print the info on screen, default True
:type show_info: bool
:param log: the file name to save the log
:type log: str
:param include_cpu: whether to detect tensor on cpu, default False
:type include_cpu: bool
:param module: when sending an `nn.Module` it, the detector can name the tensors detected better
:type module: Optional[nn.Module]
"""This class is a detector to detect tensor on different devices.
Args:
show_info (bool, optional): Whether to print the info on screen, defaults to True.
log (str, optional): The file name used to save the log. Defaults to None.
include_cpu (bool, optional): Whether to detect tensors on CPU, defaults to False.
module (Optional[:class:`nn.Module`]): When an ``nn.Module`` object is passed in,
the detector can better name the tensors it detects.
"""
self.show_info = show_info
self.log = log
@@ -48,7 +46,6 @@ class TensorDetector():
self.tensor_info[id(param)].append(param.requires_grad)
self.tensor_info[id(param)].append(param.dtype)
self.tensor_info[id(param)].append(self.get_tensor_mem(param))
def get_tensor_mem(self, tensor):
# calculate the memory occupied by a tensor
@@ -58,7 +55,6 @@ class TensorDetector():
memory_size += grad_memory_size
return self.mem_format(memory_size)
def mem_format(self, real_memory_size):
# format the tensor memory into a reasonable magnitude
if real_memory_size >= 2 ** 30:
@@ -68,7 +64,6 @@ class TensorDetector():
if real_memory_size >= 2 ** 10:
return str(real_memory_size / (2 ** 10)) + ' KB'
return str(real_memory_size) + ' B'
def collect_tensors_state(self):
for obj in gc.get_objects():
@@ -116,7 +111,6 @@ class TensorDetector():
if obj.device not in self.devices:
self.devices.append(obj.device)
def print_tensors_state(self):
template_format = '{:3s}{:<30s}{:>10s}{:>20s}{:>10s}{:>20s}{:>15s}'
self.info += LINE
@@ -173,7 +167,6 @@ class TensorDetector():
if self.log is not None:
with open(self.log + '.log', 'a') as f:
f.write(self.info)
def detect(self, include_cpu = False):
self.include_cpu = include_cpu
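
A hedged sketch of wrapping a forward pass with the detector; the toy model, a CUDA device, and the import path are assumptions:

import torch
from colossalai.utils import TensorDetector  # import path assumed

model = torch.nn.Linear(32, 32).cuda()        # assumes a CUDA device is available
detector = TensorDetector(show_info=True, log='tensor_report', module=model)
detector.detect()                             # report the tensors created so far
activation = model(torch.randn(8, 32).cuda())
detector.detect()                             # report again after the forward pass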

View File

@@ -25,7 +25,7 @@ class Timer:
return time.time()
def start(self):
"""Fisrtly synchronize cuda, reset the clock and then start the timer.
"""Firstly synchronize cuda, reset the clock and then start the timer.
"""
self._elapsed = 0
synchronize()
@@ -40,10 +40,11 @@ class Timer:
def stop(self, keep_in_history: bool = False):
"""Stop the timer and record the start-stop time interval.
:param keep_in_history: Whether does it record into history each start-stop interval, defaults to False
:type keep_in_history: bool, optional
:return: Start-stop interval
:rtype: int
Args:
keep_in_history (bool, optional): Whether to record each start-stop interval
in history, defaults to False.
Returns:
int: Start-stop interval.
"""
synchronize()
end_time = time.time()
@@ -57,26 +58,27 @@ class Timer:
def get_history_mean(self):
"""Mean of all history start-stop time intervals.
:return: Mean of time intervals
:rtype: int
Returns:
int: Mean of time intervals.
"""
return sum(self._history) / len(self._history)
def get_history_sum(self):
"""Add up all the start-stop time intervals.
:return: Sum of time intervals
:rtype: int
Returns:
int: Sum of time intervals.
"""
return sum(self._history)
def get_elapsed_time(self):
"""Return the last start-stop time interval.
.. note:: Use it only when timer is not in progress
Returns:
int: The last time interval.
:return: The last time interval
:rtype: int
Note:
Use it only when the timer is not in progress.
"""
assert not self._started, 'Timer is still in progress'
return self._elapsed
@@ -90,10 +92,10 @@ class Timer:
class MultiTimer:
"""An object contains multiple timers
"""An object contains multiple timers.
:param on: Whether the timer is enabled. Default is True
:type on: bool, optional
Args:
on (bool, optional): Whether the timer is enabled. Default is True.
"""
def __init__(self, on: bool = True):
@@ -101,10 +103,10 @@ class MultiTimer:
self._timers = dict()
def start(self, name: str):
"""Start namely one of the timers
"""Start namely one of the timers.
:param name: Timer's key
:type name: str
Args:
name (str): Timer's key.
"""
if self._on:
if name not in self._timers:
@@ -114,10 +116,9 @@ class MultiTimer:
def stop(self, name: str, keep_in_history: bool):
"""Stop namely one of the timers.
:param name: Timer's key
:type name: str
:param keep_in_history: Whether does it record into history each start-stop interval
:type keep_in_history: bool
Args:
name (str): Timer's key.
keep_in_history (bool): Whether to record each start-stop interval in history.
"""
if self._on:
return self._timers[name].stop(keep_in_history)
@@ -127,17 +128,19 @@ class MultiTimer:
def get_timer(self, name):
"""Get timer by its name (from multitimer)
:param name: Timer's key
:return: Timer with the name you give correctly
:rtype: Timer
Args:
name (str): Timer's key.
Returns:
:class:`colossalai.utils.Timer`: The timer with the given name.
"""
return self._timers[name]
def reset(self, name=None):
"""Reset timers.
:param name: If name is designated, the named timer will be reset and others will not, defaults to None
:type name: optional
Args:
name (str, optional): If a name is given, the named timer will be reset
and the others will not, defaults to None.
"""
if self._on:
if name is not None:
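
Finally, a hedged sketch of the MultiTimer API documented above; the 'forward' and 'backward' keys and the toy model are arbitrary, and the import path is an assumption:

import torch
from colossalai.utils import MultiTimer  # import path assumed

model = torch.nn.Linear(64, 64)
timer = MultiTimer(on=True)

timer.start('forward')
out = model(torch.randn(16, 64))
timer.stop('forward', keep_in_history=True)

timer.start('backward')
out.sum().backward()
timer.stop('backward', keep_in_history=True)

print(timer.get_timer('forward').get_history_mean())
timer.reset()   # with no name given, every timer is reset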