Mirror of https://github.com/hpcaitech/ColossalAI.git
[misc] update pre-commit and run all files (#4752)
* [misc] update pre-commit
* [misc] run pre-commit
* [misc] remove useless configuration files
* [misc] ignore cuda for clang-format
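The diff below shows the effect on the CLI launcher: yapf-style aligned continuation lines are rewritten into black's layout, one argument per line with a trailing comma whenever a call cannot fit on one line. As a rough illustration of the target style (a toy command, not code from this repository):

import click


# Toy example showing the style the updated pre-commit hooks enforce: black
# expands a long call to one argument per line and adds a trailing comma; that
# "magic trailing comma" then keeps the call expanded on later runs.
@click.command(
    help="Greet someone, formatted in the style the new hooks produce",
    context_settings=dict(ignore_unknown_options=True),
)
@click.option(
    "--name",
    type=str,
    default="world",
    help="the name to greet",
)
def greet(name: str) -> None:
    click.echo(f"Hello, {name}!")


if __name__ == "__main__":
    greet()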
@@ -5,56 +5,81 @@ from colossalai.context import Config
 from .run import launch_multi_processes
 
 
-@click.command(help="Launch distributed training on a single node or multiple nodes",
-               context_settings=dict(ignore_unknown_options=True))
-@click.option("-H",
-              "-host",
-              "--host",
-              type=str,
-              default=None,
-              help="the list of hostnames to launch in the format <host1>,<host2>")
-@click.option("--hostfile",
-              type=str,
-              default=None,
-              help="Hostfile path that defines the device pool available to the job, each line in the file is a hostname")
-@click.option("--include",
-              type=str,
-              default=None,
-              help="Specify computing devices to use during execution. String format is <host1>,<host2>,"
-              " only effective when used with --hostfile.")
-@click.option("--exclude",
-              type=str,
-              default=None,
-              help=
-              "Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --include,"
-              " only effective when used with --hostfile.")
-@click.option("--num_nodes",
-              type=int,
-              default=-1,
-              help="Total number of worker nodes to use, only effective when used with --hostfile.")
+@click.command(
+    help="Launch distributed training on a single node or multiple nodes",
+    context_settings=dict(ignore_unknown_options=True),
+)
+@click.option(
+    "-H",
+    "-host",
+    "--host",
+    type=str,
+    default=None,
+    help="the list of hostnames to launch in the format <host1>,<host2>",
+)
+@click.option(
+    "--hostfile",
+    type=str,
+    default=None,
+    help="Hostfile path that defines the device pool available to the job, each line in the file is a hostname",
+)
+@click.option(
+    "--include",
+    type=str,
+    default=None,
+    help="Specify computing devices to use during execution. String format is <host1>,<host2>,"
+    " only effective when used with --hostfile.",
+)
+@click.option(
+    "--exclude",
+    type=str,
+    default=None,
+    help="Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --include,"
+    " only effective when used with --hostfile.",
+)
+@click.option(
+    "--num_nodes",
+    type=int,
+    default=-1,
+    help="Total number of worker nodes to use, only effective when used with --hostfile.",
+)
 @click.option("--nproc_per_node", type=int, default=None, help="Number of GPUs to use on each node.")
-@click.option("--master_port",
-              type=int,
-              default=29500,
-              help="(optional) Port used by PyTorch distributed for communication during distributed training.")
-@click.option("--master_addr",
-              type=str,
-              default="127.0.0.1",
-              help="(optional) IP address of node 0, will be inferred via 'hostname -I' if not specified.")
+@click.option(
+    "--master_port",
+    type=int,
+    default=29500,
+    help="(optional) Port used by PyTorch distributed for communication during distributed training.",
+)
+@click.option(
+    "--master_addr",
+    type=str,
+    default="127.0.0.1",
+    help="(optional) IP address of node 0, will be inferred via 'hostname -I' if not specified.",
+)
 @click.option(
     "--extra_launch_args",
     type=str,
     default=None,
-    help=
-    "Set additional torch distributed launcher arguments such as --standalone. The format is --extra_launch_args arg1=1,arg2=2. "
-    "This will be converted to --arg1=1 --arg2=2 during execution")
+    help="Set additional torch distributed launcher arguments such as --standalone. The format is --extra_launch_args arg1=1,arg2=2. "
+    "This will be converted to --arg1=1 --arg2=2 during execution",
+)
 @click.option("--ssh-port", type=int, default=None, help="(optional) the port used for ssh connection")
 @click.argument("user_script", type=str)
-@click.argument('user_args', nargs=-1)
-def run(host: str, hostfile: str, num_nodes: int, nproc_per_node: int, include: str, exclude: str, master_addr: str,
-        master_port: int, extra_launch_args: str, ssh_port: int, user_script: str, user_args: str) -> None:
+@click.argument("user_args", nargs=-1)
+def run(
+    host: str,
+    hostfile: str,
+    num_nodes: int,
+    nproc_per_node: int,
+    include: str,
+    exclude: str,
+    master_addr: str,
+    master_port: int,
+    extra_launch_args: str,
+    ssh_port: int,
+    user_script: str,
+    user_args: str,
+) -> None:
     """
     To launch multiple processes on a single node or multiple nodes via command line.
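The --extra_launch_args help text above describes turning "arg1=1,arg2=2" into "--arg1=1 --arg2=2" before the arguments reach the torch distributed launcher. A minimal sketch of that conversion, using a hypothetical helper name (the real logic lives in the imported run module and may differ):

from typing import List


def parse_extra_launch_args(extra_launch_args: str) -> List[str]:
    # Hypothetical helper, not the actual ColossalAI implementation: turn
    # "arg1=1,arg2=2" into ["--arg1=1", "--arg2=2"].
    if not extra_launch_args:
        return []
    return [f"--{item.strip()}" for item in extra_launch_args.split(",") if item.strip()]


assert parse_extra_launch_args("arg1=1,arg2=2") == ["--arg1=1", "--arg2=2"]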
@@ -77,8 +102,8 @@ def run(host: str, hostfile: str, num_nodes: int, nproc_per_node: int, include:
     # run with hostfile excluding the hosts selected
     colossalai run --hostfile <file_path> --master_addr host1 --exclude host2 --nprocs_per_node 4 train.py
     """
-    if not user_script.endswith('.py'):
-        click.echo(f'Error: invalid Python file {user_script}. Did you use a wrong option? Try colossalai run --help')
+    if not user_script.endswith(".py"):
+        click.echo(f"Error: invalid Python file {user_script}. Did you use a wrong option? Try colossalai run --help")
         exit()
 
     args_dict = locals()
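The second hunk only swaps single quotes for double quotes in the user_script validation. That error branch can be exercised with click's test runner; the import path here is an assumption, so adjust it to wherever the run command is actually defined:

from click.testing import CliRunner

# Assumed import path for the command shown in this diff; adjust if needed.
from colossalai.cli.launcher import run

# A non-.py script should hit the error branch and print the message above.
runner = CliRunner()
result = runner.invoke(run, ["train.sh"])
assert "invalid Python file" in result.output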