[misc] refactor launch API and tensor constructor (#5666)

* [misc] remove config arg from initialize

* [misc] remove old tensor constructor

* [plugin] add npu support for ddp

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [devops] fix doc test ci

* [test] fix test launch

* [doc] update launch doc

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Author: Hongxin Liu
Date: 2024-04-29 10:40:11 +08:00
Committed by: GitHub
Parent: 91fa553775
Commit: 7f8b16635b

223 changed files with 294 additions and 403 deletions
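In practice the migration is mechanical: the `config` argument disappears from every launch entry point and nothing else moves. A before/after sketch distilled from the diffs below:

```python
import colossalai

# before this commit — a config dict (often empty) was mandatory:
# colossalai.launch_from_torch(config={}, seed=42)

# after this commit — the config argument is simply dropped:
colossalai.launch_from_torch(seed=42)
```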


@@ -75,7 +75,7 @@ WARMUP_FRACTION = 0.1
we create a distributed environment.
```python
# Launch ColossalAI
-colossalai.launch_from_torch(config={}, seed=42)
+colossalai.launch_from_torch(seed=42)
coordinator = DistCoordinator()
```
Prepare the dataset. You can use `plugin.prepare_dataloader` to generate a dataloader, or customize your own dataloader.
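As a hedged sketch of that helper, continuing from the `plugin` object created earlier in this doc (the keyword names follow the common plugin signature; verify them against your version):

```python
import torch
from torch.utils.data import TensorDataset

# toy dataset, only to make the sketch self-contained
dataset = TensorDataset(torch.randn(64, 8), torch.randint(0, 2, (64,)))
# the plugin shards the sampler across ranks for you
dataloader = plugin.prepare_dataloader(dataset, batch_size=16, shuffle=True, drop_last=True)
```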


@@ -71,7 +71,7 @@ PP_SIZE = 2
Create a distributed environment.
```python
# Launch ColossalAI
-colossalai.launch_from_torch(config={}, seed=SEED)
+colossalai.launch_from_torch(seed=SEED)
coordinator = DistCoordinator()
world_size = coordinator.world_size
```


@@ -55,7 +55,7 @@ from colossalai.booster.plugin import TorchDDPPlugin
def train():
    # launch colossalai
-    colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost')
+    colossalai.launch(rank=rank, world_size=world_size, port=port, host='localhost')
    # create plugin and objects for training
    plugin = TorchDDPPlugin()
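Because this form of `colossalai.launch` takes an explicit rank, it pairs naturally with process spawning. A minimal sketch, assuming a single node and a hypothetical free port:

```python
import torch.multiprocessing as mp

import colossalai
from colossalai.booster.plugin import TorchDDPPlugin

def train(rank: int, world_size: int, port: int):
    # each spawned worker reports its own rank to the launcher
    colossalai.launch(rank=rank, world_size=world_size, port=port, host='localhost')
    plugin = TorchDDPPlugin()
    ...

if __name__ == '__main__':
    world_size = 4  # arbitrary choice for the sketch
    mp.spawn(train, args=(world_size, 29500), nprocs=world_size)
```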


@@ -87,8 +87,7 @@ import colossalai
args = colossalai.get_default_parser().parse_args()
# launch distributed environment
-colossalai.launch(config=args.config,
-                  rank=args.rank,
+colossalai.launch(rank=args.rank,
                   world_size=args.world_size,
                   host=args.host,
                   port=args.port,
@@ -106,20 +105,11 @@ First, we need to set the launch method in our code. As this is a wrapper of the
use `colossalai.launch_from_torch`. The arguments required for the distributed environment, such as rank, world size, host and port, are all set by the PyTorch
launcher and can be read directly from environment variables.
-config.py
-```python
-BATCH_SIZE = 512
-LEARNING_RATE = 3e-3
-WEIGHT_DECAY = 0.3
-NUM_EPOCHS = 2
-```
train.py
```python
import colossalai
-colossalai.launch_from_torch(
-    config="./config.py",
-)
+colossalai.launch_from_torch()
...
```
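Concretely, the call now relies entirely on the launcher's environment. A minimal sketch of the new pattern (the process count is arbitrary):

```python
# train.py — RANK, WORLD_SIZE, MASTER_ADDR and MASTER_PORT are
# exported by the PyTorch launcher, so no arguments are needed
import colossalai

colossalai.launch_from_torch()
```

which would be started with something like `colossalai run --nproc_per_node 4 train.py` or plain `torchrun`.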
@@ -203,7 +193,6 @@ Do this in your training script:
import colossalai
colossalai.launch_from_slurm(
-    config=<CONFIG>,
    host=args.host,
    port=args.port
)
@@ -224,7 +213,6 @@ use them to start the distributed backend.
Do this in your train.py:
```python
colossalai.launch_from_openmpi(
-    config=<CONFIG>,
    host=args.host,
    port=args.port
)
@@ -238,3 +226,5 @@ mpirun --hostfile <my_hostfile> -np <num_process> python train.py --host <node n
- --hostfile: use this option to specify a list of hosts on which to run
- --np: set the number of processes (GPUs) to launch in total. For example, if --np 4, 4 python processes will be initialized to run train.py.
+<!-- doc-test-command: echo -->
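For concreteness, a hypothetical hostfile matching the command above (host names and slot counts are made up):

```
# my_hostfile — two nodes, 4 GPUs each
node1 slots=4
node2 slots=4
```

which would be launched with `mpirun --hostfile my_hostfile -np 8 python train.py --host node1 --port 29500`.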


@@ -45,7 +45,7 @@ We then need to initialize the distributed environment. For demo purposes, we use `l
parser = colossalai.get_default_parser()
args = parser.parse_args()
# launch from torch
-colossalai.launch_from_torch(config=dict())
+colossalai.launch_from_torch()
```
### Step 3. Create training components


@@ -61,7 +61,7 @@ We then need to initialize the distributed environment. For demo purposes, we use `l
for other initialization methods.
```python
-colossalai.launch_from_torch(config=dict())
+colossalai.launch_from_torch()
logger = get_dist_logger()
```
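Continuing that snippet, a small hedged usage note: the dist logger can restrict a record to chosen ranks, which keeps multi-process output readable (the `ranks` argument follows the colossalai logger docs; treat it as an assumption):

```python
logger.info("distributed environment is ready", ranks=[0])  # printed by rank 0 only
```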


@@ -29,7 +29,7 @@ from colossalai.booster.plugin import GeminiPlugin
from transformers import LlamaForCausalLM, LlamaConfig, BertForPreTraining
-colossalai.launch({})
+colossalai.launch()
plugin = GeminiPlugin()
booster = Booster(plugin)
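A hedged continuation of this snippet: `booster.boost` is what actually wraps the model and optimizer for the plugin. `HybridAdam` and the five-tuple return shape follow the Booster docs, but treat the details as assumptions:

```python
from colossalai.nn.optimizer import HybridAdam  # the optimizer Gemini examples typically use

model = LlamaForCausalLM(LlamaConfig())
optimizer = HybridAdam(model.parameters(), lr=1e-4)
# boost() returns (model, optimizer, criterion, dataloader, lr_scheduler)
model, optimizer, *_ = booster.boost(model, optimizer)
```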


@@ -20,10 +20,10 @@ In Colossal-AI, we have incorporated different implementations of mixed precisio
3. naive amp
| Colossal-AI | support tensor parallel | support pipeline parallel | fp16 extent |
-| -------------- | ----------------------- | ------------------------- | ---------------------------------------------------------------------------------------------------- |
-| AMP_TYPE.TORCH | ✅ | ❌ | Model parameters, activation, gradients are downcast to fp16 during forward and backward propagation |
-| AMP_TYPE.APEX | ❌ | ❌ | More fine-grained, we can choose opt_level O0, O1, O2, O3 |
-| AMP_TYPE.NAIVE | ✅ | ✅ | Model parameters, forward and backward operations are all downcast to fp16 |
+|----------------|-------------------------|---------------------------|------------------------------------------------------------------------------------------------------|
+| AMP_TYPE.TORCH | ✅ | ❌ | Model parameters, activation, gradients are downcast to fp16 during forward and backward propagation |
+| AMP_TYPE.APEX | ❌ | ❌ | More fine-grained, we can choose opt_level O0, O1, O2, O3 |
+| AMP_TYPE.NAIVE | ✅ | ✅ | Model parameters, forward and backward operations are all downcast to fp16 |
The first two rely on the original implementation of PyTorch (version 1.6 and above) and NVIDIA Apex.
The last method is similar to Apex O2 level.
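If you want the torch-backed mode through the current API, here is a hedged sketch via the booster (`mixed_precision='fp16'` follows the Booster docs; treat the exact string as an assumption):

```python
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchDDPPlugin

colossalai.launch_from_torch()
# 'fp16' selects the torch-amp implementation, i.e. the AMP_TYPE.TORCH row above
booster = Booster(plugin=TorchDDPPlugin(), mixed_precision='fp16')
```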
@@ -164,7 +164,7 @@ parser = colossalai.get_default_parser()
args = parser.parse_args()
# launch from torch
-colossalai.launch_from_torch(config=dict())
+colossalai.launch_from_torch()
```


@@ -185,7 +185,7 @@ Then we can train GPT model with Gemini. The placement policy of Gemini should b
```python
def train_gemini_cpu(nvme_offload_fraction: float = 0.0):
-    colossalai.launch_from_torch({})
+    colossalai.launch_from_torch()
    config = GPT2Config()
    with ColoInitContext(device=torch.cuda.current_device()):
        model = GPT2LMHeadModel(config)


@@ -174,7 +174,7 @@ def main():
SEQ_LEN = 1024
VOCAB_SIZE = 50257
NUM_STEPS = 10
-colossalai.launch_from_torch(config={})
+colossalai.launch_from_torch()
# build criterion
criterion = GPTLMLoss()


@@ -62,7 +62,7 @@ plugin = HybridParallelPlugin(
## Create a distributed environment
```python
# Launch ColossalAI
-colossalai.launch_from_torch(config={}, seed=42)
+colossalai.launch_from_torch(seed=42)
coordinator = DistCoordinator()
```
## Define the training components for the GPT-2 model


@@ -70,7 +70,7 @@ PP_SIZE = 2
First, we create a distributed environment.
```python
# Launch ColossalAI
-colossalai.launch_from_torch(config={}, seed=SEED)
+colossalai.launch_from_torch(seed=SEED)
coordinator = DistCoordinator()
world_size = coordinator.world_size
```


@@ -60,7 +60,7 @@ from colossalai.booster.plugin import TorchDDPPlugin
def train():
    # launch colossalai
-    colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost')
+    colossalai.launch(rank=rank, world_size=world_size, port=port, host='localhost')
    # create plugin and objects for training
    plugin = TorchDDPPlugin()


@@ -74,8 +74,7 @@ import colossalai
args = colossalai.get_default_parser().parse_args()
# launch distributed environment
-colossalai.launch(config=args.config,
-                  rank=args.rank,
+colossalai.launch(rank=args.rank,
                   world_size=args.world_size,
                   host=args.host,
                   port=args.port,
@@ -93,20 +92,11 @@ The launcher built into PyTorch requires the command to be started on every node to launch multi
First, we need to specify the launch method in our code. As this launcher is a wrapper of the PyTorch launcher, we should naturally use `colossalai.launch_from_torch`.
The arguments required for the distributed environment, such as rank, world size, host and port, are all set by the PyTorch launcher and can be read directly from environment variables.
-config.py
-```python
-BATCH_SIZE = 512
-LEARNING_RATE = 3e-3
-WEIGHT_DECAY = 0.3
-NUM_EPOCHS = 2
-```
train.py
```python
import colossalai
-colossalai.launch_from_torch(
-    config="./config.py",
-)
+colossalai.launch_from_torch()
...
```
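As a quick sanity check after the new zero-argument call, standard torch.distributed queries work once launch returns (a minimal sketch, assuming a torchrun-style launch):

```python
import torch.distributed as dist

import colossalai

colossalai.launch_from_torch()
print(f"rank {dist.get_rank()} of {dist.get_world_size()} is ready")
```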
@@ -186,7 +176,6 @@ colossalai run --nproc_per_node 4 --hostfile ./hostfile --master_addr host1 --e
import colossalai
colossalai.launch_from_slurm(
-    config=<CONFIG>,
    host=args.host,
    port=args.port
)
@@ -206,7 +195,6 @@ srun python train.py --host <master_node> --port 29500
You can try the following in your training script.
```python
colossalai.launch_from_openmpi(
-    config=<CONFIG>,
    host=args.host,
    port=args.port
)
@@ -219,3 +207,5 @@ mpirun --hostfile <my_hostfile> -np <num_process> python train.py --host <node n
- --hostfile: use this option to specify a list of hosts on which to run
- --np: set the number of processes (GPUs) to launch in total. For example, if --np 4, four Python processes will be initialized to run train.py.
+<!-- doc-test-command: echo -->
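A sketch of a matching train.py entry point for this OpenMPI path (rank and world size are read from the OMPI environment by ColossalAI; the flag handling mirrors the docs above):

```python
import colossalai

# --host and --port come from the mpirun command line shown above
parser = colossalai.get_default_parser()
args = parser.parse_args()
colossalai.launch_from_openmpi(host=args.host, port=args.port)
```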


@@ -46,7 +46,7 @@ parser = colossalai.get_default_parser()
args = parser.parse_args()
# launch from torch
-colossalai.launch_from_torch(config=dict())
+colossalai.launch_from_torch()
```


@@ -61,7 +61,7 @@ from colossalai.nn.lr_scheduler import CosineAnnealingLR
We need to initialize the distributed environment. For a quick demo, we use `launch_from_torch`. You can refer to [Launch Colossal-AI](../basics/launch_colossalai.md) for other initialization methods.
```python
-colossalai.launch_from_torch(config=dict())
+colossalai.launch_from_torch()
logger = get_dist_logger()
```


@@ -29,7 +29,7 @@ from colossalai.booster.plugin import GeminiPlugin
from transformers import LlamaForCausalLM, LlamaConfig, BertForPreTraining
-colossalai.launch({})
+colossalai.launch()
plugin = GeminiPlugin()
booster = Booster(plugin)


@@ -19,11 +19,11 @@ AMP stands for automatic mixed precision training.
2. apex.amp
3. naive amp
-| Colossal-AI | support tensor parallel | support pipeline parallel | fp16 extent |
-| -------------- | ----------------------- | ------------------------- | --------------------------------------------------------------------------------------------------- |
-| AMP_TYPE.TORCH | ✅ | ❌ | Model parameters, activation, gradients are downcast to fp16 during forward and backward propagation |
-| AMP_TYPE.APEX | ❌ | ❌ | More fine-grained, we can choose opt_level O0, O1, O2, O3 |
-| AMP_TYPE.NAIVE | ✅ | ✅ | Model parameters, forward and backward operations are all downcast to fp16 |
+| Colossal-AI | support tensor parallel | support pipeline parallel | fp16 extent |
+|----------------|-------------------------|---------------------------|-------------------------------------------------------------------------------------------------------|
+| AMP_TYPE.TORCH | ✅ | ❌ | Model parameters, activation, gradients are downcast to fp16 during forward and backward propagation |
+| AMP_TYPE.APEX | ❌ | ❌ | More fine-grained, we can choose opt_level O0, O1, O2, O3 |
+| AMP_TYPE.NAIVE | ✅ | ✅ | Model parameters, forward and backward operations are all downcast to fp16 |
The first two rely on the original implementations of PyTorch (version 1.6 and above) and NVIDIA Apex. The last method is similar to Apex O2 level. Among these methods, Apex-AMP is not compatible with tensor parallelism: since tensors are split across devices under tensor parallelism, communication between processes is required to check whether inf or nan appears anywhere in the model's weights. We modified the torch amp implementation so that it is now compatible with tensor parallelism.
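For intuition about what the torch-backed mode (AMP_TYPE.TORCH) does underneath, here is a plain-PyTorch sketch — not ColossalAI API, and the toy model and sizes are made up:

```python
import torch
from torch import nn

# hypothetical toy setup, only to make the snippet runnable
model = nn.Linear(16, 4).cuda()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
criterion = nn.CrossEntropyLoss()
inputs = torch.randn(8, 16, device="cuda")
labels = torch.randint(0, 4, (8,), device="cuda")

scaler = torch.cuda.amp.GradScaler()
with torch.cuda.amp.autocast():            # forward runs in fp16 where safe
    loss = criterion(model(inputs), labels)
scaler.scale(loss).backward()              # loss scaling avoids fp16 underflow
scaler.step(optimizer)
scaler.update()
```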
@@ -153,7 +153,7 @@ parser = colossalai.get_default_parser()
args = parser.parse_args()
# launch from torch
-colossalai.launch_from_torch(config=dict())
+colossalai.launch_from_torch()
```


@@ -175,7 +175,7 @@ Mem usage: 4968.016 MB
```python
def train_gemini_cpu(nvme_offload_fraction: float = 0.0):
-    colossalai.launch_from_torch({})
+    colossalai.launch_from_torch()
    config = GPT2Config()
    with ColoInitContext(device=torch.cuda.current_device()):
        model = GPT2LMHeadModel(config)


@@ -174,7 +174,7 @@ def main():
SEQ_LEN = 1024
VOCAB_SIZE = 50257
NUM_STEPS = 10
-colossalai.launch_from_torch(config={})
+colossalai.launch_from_torch()
# build criterion
criterion = GPTLMLoss()