Fixed docstring in colossalai (#171)

Author: HELSON (committed by GitHub)
Date: 2022-01-21 10:44:30 +08:00
Parent: e2089c5c15
Commit: 0f8c7f9804
77 changed files with 983 additions and 603 deletions


@@ -29,12 +29,12 @@ from colossalai.global_variables import moe_env
def get_default_parser():
- '''Reads user command line and uses an argument parser to parse the input arguments.
+ """Reads user command line and uses an argument parser to parse the input arguments.
Input arguments include configuration, host, port, world size, local rank, backend for torch.distributed.
- :return: returns the parser with the default arguments, the user may add customized arguments into this parser
+ :return: Returns the parser with the default arguments, the user may add customized arguments into this parser
:rtype: Namespace
- '''
+ """
parser = argparse.ArgumentParser()
parser.add_argument('--config', type=str, help='path to the config file')
parser.add_argument('--host',
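
For context, a minimal usage sketch of the parser documented above. It assumes get_default_parser is re-exported at the colossalai package level; the extra --batch_size flag is purely illustrative and not one of the defaults.

# Illustrative only, not part of this diff.
import colossalai

parser = colossalai.get_default_parser()                      # parser with --config, --host, --port, etc.
parser.add_argument('--batch_size', type=int, default=32)     # hypothetical user-added argument
args = parser.parse_args()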
@@ -64,28 +64,30 @@ def launch(config: Union[str, Path, Config, Dict],
local_rank: int = None,
seed: int = 1024,
verbose: bool = True):
- '''This function first parses the configuration arguments, using :func:parse_args() in case one of the input arguments are not given.
- Then initialize and set distributed environment by calling global_context's functions.
+ """This function first parses the configuration arguments, using :func:`parse_args()` in case one of the input
+ arguments are not given. Then initialize and set distributed environment by calling global_context's functions.
- :param config: config file or config file path are both acceptable
+ :param config: Config file or config file path are both acceptable
:type config: Union[str, dict, Config]
- :param rank: rank for the default process group
+ :param rank: Rank for the default process group
:type rank: int
- :param world_size: world size of the default process group
+ :param world_size: World size of the default process group
:type world_size: int
- :param host: the master address for distributed training
+ :param host: The master address for distributed training
:type host: str
- :param port: the master port for distributed training
+ :param port: The master port for distributed training
:type port: str
- :param backend: backend for torch.distributed
- :type backend: str
- :param local_rank: rank for the process on the node and is used to set the default CUDA device,
- defaults to None. If local_rank = None, the default device ordinal will be calculated automatically
+ :param backend: Backend for torch.distributed
+ :type backend: str, optional
+ :param local_rank: Rank for the process on the node and is used to set the default CUDA device, defaults to None.
+ If local_rank = None, the default device ordinal will be calculated automatically
:type local_rank: int, optional
- :param verbose: whether to print logs
- :type verbose: bool
- :raises Exception: raise exception when config type is wrong
- '''
+ :param seed: Specified random seed for every processes
+ :type seed: int, optional
+ :param verbose: Whether to print logs
+ :type verbose: bool, optional
+ :raises Exception: Raise exception when config type is wrong
+ """
gpc.verbose = verbose
# set config
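
For reference, a hedged sketch of calling launch directly for a single process. The config path, address, and port are placeholders, and port is passed as a string because the docstring above types it as str.

# Illustrative single-process call; all values are placeholders.
import colossalai

colossalai.launch(config='./config.py',    # config file path (a dict or Config also works per the type hint)
                  rank=0,                  # rank in the default process group
                  world_size=1,
                  host='127.0.0.1',        # master address
                  port='29500',            # docstring above documents port as str
                  backend='nccl',
                  seed=1024,
                  verbose=True)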
@@ -123,20 +125,22 @@ def launch_from_slurm(config: Union[str, Path, Config, Dict],
backend: str = 'nccl',
seed: int = 1024,
verbose: bool = True):
- '''A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables
+ """A wrapper for colossalai.launch for SLURM launcher by reading rank and world size from the environment variables
set by SLURM
- :param config: config file or config file path are both acceptable
+ :param config: Config file or config file path are both acceptable
:type config: Union[str, dict, Config]
- :param host: the master address for distributed training
+ :param host: The master address for distributed training
:type host: str
- :param port: the master port for distributed training
+ :param port: The master port for distributed training
:type port: str
- :param backend: backend for torch.distributed
- :type backend: str
- :param verbose: whether to print logs
- :type verbose: bool
- '''
+ :param backend: Backend for torch.distributed
+ :type backend: str, optional
+ :param seed: Specified random seed for every processes
+ :type seed: int, optional
+ :param verbose: Whether to print logs
+ :type verbose: bool, optional
+ """
rank = int(os.environ['SLURM_PROCID'])
world_size = int(os.environ['SLURM_NPROCS'])
launch(config=config,
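
A sketch of the SLURM path, assuming the script is launched with srun so that SLURM_PROCID and SLURM_NPROCS are set; host and port are placeholders.

# Typically submitted via srun; SLURM_PROCID / SLURM_NPROCS supply rank and world size.
import colossalai

colossalai.launch_from_slurm(config='./config.py',
                             host='node001',      # placeholder master address
                             port='29500',
                             backend='nccl')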
@@ -155,20 +159,22 @@ def launch_from_openmpi(config: Union[str, Path, Config, Dict],
backend: str = 'nccl',
seed: int = 1024,
verbose: bool = True):
- '''A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables
+ """A wrapper for colossalai.launch for OpenMPI launcher by reading rank and world size from the environment variables
set by OpenMPI
- :param config: config file or config file path are both acceptable
+ :param config: Config file or config file path are both acceptable
:type config: Union[str, dict, Config]
- :param host: the master address for distributed training
+ :param host: The master address for distributed training
:type host: str
- :param port: the master port for distributed training
+ :param port: The master port for distributed training
:type port: str
- :param backend: backend for torch.distributed
- :type backend: str
- :param verbose: whether to print logs
- :type verbose: bool
- '''
+ :param backend: Backend for torch.distributed
+ :type backend: str, optional
+ :param seed: Specified random seed for every processes
+ :type seed: int, optional
+ :param verbose: Whether to print logs
+ :type verbose: bool, optional
+ """
rank = int(os.environ['OMPI_COMM_WORLD_RANK'])
local_rank = int(os.environ['OMPI_COMM_WORLD_LOCAL_RANK'])
world_size = int(os.environ['OMPI_COMM_WORLD_SIZE'])
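
The OpenMPI wrapper is used the same way, assuming the script is started with mpirun so that the OMPI_COMM_WORLD_* variables are available.

# Typically started with mpirun; OMPI_COMM_WORLD_RANK / _LOCAL_RANK / _SIZE supply the ranks.
import colossalai

colossalai.launch_from_openmpi(config='./config.py',
                               host='node001',    # placeholder master address
                               port='29500',
                               backend='nccl')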
@@ -187,20 +193,18 @@ def launch_from_torch(config: Union[str, Path, Config, Dict],
backend: str = 'nccl',
seed: int = 1024,
verbose: bool = True):
- '''A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size
+ """A wrapper for colossalai.launch for torchrun or torch.distributed.launch by reading rank and world size
from the environment variables set by PyTorch
- :param config: config file or config file path are both acceptable
+ :param config: Config file or config file path are both acceptable
:type config: Union[str, dict, Config]
- :param host: the master address for distributed training
- :type host: str
- :param port: the master port for distributed training
- :type port: str
- :param backend: backend for torch.distributed
- :type backend: str
- :param verbose: whether to print logs
- :type verbose: bool
- '''
+ :param backend: Backend for torch.distributed
+ :type backend: str, optional
+ :param seed: Specified random seed for every processes
+ :type seed: int, optional
+ :param verbose: Whether to print logs
+ :type verbose: bool, optional
+ """
rank = int(os.environ['RANK'])
local_rank = int(os.environ['LOCAL_RANK'])
world_size = int(os.environ['WORLD_SIZE'])
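
A sketch of the torch launcher path, assuming the script is started with torchrun (or torch.distributed.launch) so that RANK, LOCAL_RANK and WORLD_SIZE are set. Whether host and port must still be passed at this revision is not visible in the hunk, so only config and backend are shown.

# Illustrative; run with e.g.  torchrun --nproc_per_node=8 train.py
import colossalai

colossalai.launch_from_torch(config='./config.py', backend='nccl')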
@@ -225,25 +229,26 @@ def initialize(model: Union[nn.Module, List[nn.Module]],
lr_scheduler: _LRScheduler = None,
verbose: bool = True
) -> Tuple[Engine, DataLoader, DataLoader, _LRScheduler]:
- ''' Core function to wrap the essential training components with our functionality based on the config which is loaded into gpc.config.
+ """Core function to wrap the essential training components with our functionality based on the config which is
+ loaded into gpc.config.
- :param model: your model instance
+ :param model: Your model instance
:type model: :class:`torch.nn.Module`
- :param optimizer: your optimizer instance
+ :param optimizer: Your optimizer instance
:type optimizer: :class:`torch.optim.optimizer.Optimizer`
- :param criterion: your criterion instance
+ :param criterion: Your criterion instance
:type criterion: :class:`torch.nn.modules.loss._Loss`
- :param train_dataloader: dataloader for training data
- :type train_dataloader: :class:`torch.utils.data.DataLoader`
- :param train_dataloader: dataloader for testing data
- :type train_dataloader: :class:`torch.utils.data.DataLoader`
- :param lr_scheduler: your lr scheduler instance
- :type lr_scheduler: :class:`torch.nn.lr_scheduler._LRScheduler`
- :param verbose: whether to print logs
- :type verbose: bool
+ :param train_dataloader: Dataloader for training
+ :type train_dataloader: :class:`torch.utils.data.DataLoader`, optional
+ :param test_dataloader: Dataloader for testing
+ :type test_dataloader: :class:`torch.utils.data.DataLoader`, optional
+ :param lr_scheduler: Your lr scheduler instance
+ :type lr_scheduler: :class:`torch.nn.lr_scheduler._LRScheduler`, optional
+ :param verbose: Whether to print logs
+ :type verbose: bool, optional
:return: (engine, train_dataloader, test_dataloader, lr_scheduler)
- :rtype: tuple
- '''
+ :rtype: Tuple
+ """
# get logger
logger = get_dist_logger()
gpc.verbose = verbose
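
Finally, a self-contained sketch of the initialize call documented above, using toy components; it assumes a distributed context has already been set up by one of the launch functions.

# Toy components; assumes one of the launch_* wrappers above has already been called.
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
import colossalai

model = nn.Linear(16, 2)
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
criterion = nn.CrossEntropyLoss()
train_loader = DataLoader(TensorDataset(torch.randn(64, 16),
                                        torch.randint(0, 2, (64,))),
                          batch_size=8)

engine, train_dataloader, test_dataloader, lr_scheduler = colossalai.initialize(
    model=model,
    optimizer=optimizer,
    criterion=criterion,
    train_dataloader=train_loader)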