[misc] refactor launch API and tensor constructor (#5666)

* [misc] remove config arg from initialize

* [misc] remove old tensor constructor

* [plugin] add npu support for ddp

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [devops] fix doc test ci

* [test] fix test launch

* [doc] update launch doc

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Author: Hongxin Liu
Authored: 2024-04-29 10:40:11 +08:00
Committed by: GitHub
Parent: 91fa553775
Commit: 7f8b16635b
223 changed files with 294 additions and 403 deletions
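The user-facing change repeated across the files below is the same: the `config` argument disappears from the whole launch family (`colossalai.launch`, `colossalai.launch_from_torch`, `colossalai.launch_from_slurm`). A minimal migration sketch, assuming the script is started with torchrun so the usual RANK/WORLD_SIZE/MASTER_ADDR/MASTER_PORT environment variables are already set (the seed mirrors one of the examples below; it is not required):

import colossalai

# before this commit: a (typically empty) config dict had to be passed
#   colossalai.launch_from_torch(config={})

# after this commit: the config argument is simply dropped
colossalai.launch_from_torch(seed=42)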

@@ -94,8 +94,7 @@ def train_gpt(args):
 def run(rank, world_size, port, args):
-    config = {}
-    colossalai.launch(config=config, rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
+    colossalai.launch(rank=rank, world_size=world_size, host="localhost", port=port, backend="nccl")
     train_gpt(args)

@@ -47,7 +47,7 @@ def get_data(batch_size, seq_len, vocab_size):
 def main():
     disable_existing_loggers()
-    launch_from_torch(config={})
+    launch_from_torch()
     logger = get_dist_logger()
     config = transformers.GPT2Config(n_position=SEQ_LENGTH, n_layer=NUM_LAYERS, n_head=NUM_HEADS, n_embd=HIDDEN_DIM)
     if FP16:

@@ -132,7 +132,7 @@ def main():
     PROF_FLAG = False  # The flag of profiling, False by default
     disable_existing_loggers()
-    colossalai.launch_from_torch(config={})
+    colossalai.launch_from_torch()
     logger = get_dist_logger()
     logger.info(f"{args.model_type}, {args.distplan}, batch size {BATCH_SIZE}", ranks=[0])

@@ -67,7 +67,7 @@ def main():
     parser.add_argument("--cpu_offload", action="store_true", help="Use gradient checkpointing")
     args = parser.parse_args()
-    colossalai.launch_from_torch({})
+    colossalai.launch_from_torch()
     coordinator = DistCoordinator()
     def empty_init():

@@ -196,7 +196,7 @@ def main():
     # ==============================
     # Launch Distributed Environment
     # ==============================
-    colossalai.launch_from_torch(config={}, seed=42)
+    colossalai.launch_from_torch(seed=42)
     coordinator = DistCoordinator()
     # local_batch_size = BATCH_SIZE // coordinator.world_size
# local_batch_size = BATCH_SIZE // coordinator.world_size

@@ -36,9 +36,9 @@ def main():
     args = parser.parse_args()
     disable_existing_loggers()
     if args.from_torch:
-        colossalai.launch_from_torch(config=args.config)
+        colossalai.launch_from_torch()
     else:
-        colossalai.launch_from_slurm(config=args.config, host=args.host, port=29500, seed=42)
+        colossalai.launch_from_slurm(host=args.host, port=29500, seed=42)
     logger = get_dist_logger()
     data_path = None if args.use_dummy_dataset else os.environ["DATA"]
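The last hunk above is representative of how the examples choose between the torchrun and SLURM entry points after the refactor. A condensed, self-contained sketch of that pattern with a hypothetical argument parser (the `--from_torch` and `--host` flag names are assumed from `args.from_torch` / `args.host` in the diff, not taken from this commit):

import argparse

import colossalai

parser = argparse.ArgumentParser()
parser.add_argument("--from_torch", action="store_true")  # assumed flag name
parser.add_argument("--host", type=str, default=None)     # assumed flag name
args = parser.parse_args()

if args.from_torch:
    # started via torchrun: rank, world size and master address come from the environment
    colossalai.launch_from_torch(seed=42)
else:
    # started under SLURM: host and port are still passed explicitly, config no longer is
    colossalai.launch_from_slurm(host=args.host, port=29500, seed=42)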