mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2025-09-21 09:29:47 +00:00
[misc] refactor launch API and tensor constructor (#5666)
* [misc] remove config arg from initialize * [misc] remove old tensor constructor * [plugin] add npu support for ddp * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [devops] fix doc test ci * [test] fix test launch * [doc] update launch doc --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
This commit is contained in:
@@ -60,7 +60,7 @@ from colossalai.booster.plugin import TorchDDPPlugin
|
||||
|
||||
def train():
|
||||
# launch colossalai
|
||||
colossalai.launch(config=dict(), rank=rank, world_size=world_size, port=port, host='localhost')
|
||||
colossalai.launch(rank=rank, world_size=world_size, port=port, host='localhost')
|
||||
|
||||
# create plugin and objects for training
|
||||
plugin = TorchDDPPlugin()
|
||||
|
@@ -74,8 +74,7 @@ import colossalai
|
||||
args = colossalai.get_default_parser().parse_args()
|
||||
|
||||
# launch distributed environment
|
||||
colossalai.launch(config=args.config,
|
||||
rank=args.rank,
|
||||
colossalai.launch(rank=args.rank,
|
||||
world_size=args.world_size,
|
||||
host=args.host,
|
||||
port=args.port,
|
||||
@@ -93,20 +92,11 @@ PyTorch自带的启动器需要在每个节点上都启动命令才能启动多
|
||||
首先,我们需要在代码里指定我们的启动方式。由于这个启动器是PyTorch启动器的封装,那么我们自然而然应该使用`colossalai.launch_from_torch`。
|
||||
分布式环境所需的参数,如 rank, world size, host 和 port 都是由 PyTorch 启动器设置的,可以直接从环境变量中读取。
|
||||
|
||||
config.py
|
||||
```python
|
||||
BATCH_SIZE = 512
|
||||
LEARNING_RATE = 3e-3
|
||||
WEIGHT_DECAY = 0.3
|
||||
NUM_EPOCHS = 2
|
||||
```
|
||||
train.py
|
||||
```python
|
||||
import colossalai
|
||||
|
||||
colossalai.launch_from_torch(
|
||||
config="./config.py",
|
||||
)
|
||||
colossalai.launch_from_torch()
|
||||
...
|
||||
```
|
||||
|
||||
@@ -186,7 +176,6 @@ colossalai run --nproc_per_node 4 --hostfile ./hostfile --master_addr host1 --e
|
||||
import colossalai
|
||||
|
||||
colossalai.launch_from_slurm(
|
||||
config=<CONFIG>,
|
||||
host=args.host,
|
||||
port=args.port
|
||||
)
|
||||
@@ -206,7 +195,6 @@ srun python train.py --host <master_node> --port 29500
|
||||
您可以在您的训练脚本中尝试以下操作。
|
||||
```python
|
||||
colossalai.launch_from_openmpi(
|
||||
config=<CONFIG>,
|
||||
host=args.host,
|
||||
port=args.port
|
||||
)
|
||||
@@ -219,3 +207,5 @@ mpirun --hostfile <my_hostfile> -np <num_process> python train.py --host <node n
|
||||
|
||||
- --hostfile: 指定一个要运行的主机列表。
|
||||
- --np: 设置总共要启动的进程(GPU)的数量。例如,如果 --np 4,4个 python 进程将被初始化以运行 train.py。
|
||||
|
||||
<!-- doc-test-command: echo -->
|
||||
|
Reference in New Issue
Block a user