[moe] support optimizer checkpoint (#5015)

* Refactor MoE Manager setup method

* unshard optim ckpt

* optim io

* update transformer version

* update requirements

* update ckpt

* update ckpt

* update ckpt

* fix engine

* fix engine
commit f71e63b0f3
parent 67f5331754
Author: Xuanlei Zhao
Date:   2023-11-08 23:07:03 +08:00
Committed by: GitHub
20 changed files with 738 additions and 150 deletions
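
The headline change is optimizer-checkpoint support for MoE training. Below is a minimal sketch of how that capability is typically exercised through ColossalAI's `Booster` checkpoint I/O; the import paths, plugin arguments, and the toy model are assumptions for illustration, not code from this commit:

```python
import torch
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin import MoeHybridParallelPlugin
from colossalai.moe import MOE_MANAGER  # import path is an assumption

# Launch the distributed environment (expects torchrun-style env vars).
colossalai.launch_from_torch(config={})
MOE_MANAGER.setup(parallel=None)  # mirrors the setup call after this refactor

# Hypothetical stand-ins for the openmoe model/optimizer used by the examples.
model = torch.nn.Linear(8, 8)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# Plugin arguments here are placeholders, not the exact values from the examples.
plugin = MoeHybridParallelPlugin(tp_size=1, pp_size=1, precision="bf16", zero_stage=1)
booster = Booster(plugin=plugin)
model, optimizer, *_ = booster.boost(model, optimizer)

# The new capability: save a sharded optimizer checkpoint and load it back.
booster.save_optimizer(optimizer, "ckpt/optimizer", shard=True)
booster.load_optimizer(optimizer, "ckpt/optimizer")
```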

@@ -155,9 +155,7 @@ def main():
         "precision": "bf16",
         "zero_stage": args.zero_stage,
     }
-    mgr_dict = {
-        "seed": 42,
-    }
+    mgr_dict = {}
     if args.plugin == "ep":
         dp_size = dist.get_world_size()
         plugin = MoeHybridParallelPlugin(

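Both `mgr_dict` hunks in this commit drop the hard-coded `"seed": 42` entry, so the seed is no longer threaded through the MoE manager kwargs. If deterministic runs are still wanted, seeding would now happen separately; a hedged sketch using standard PyTorch/NumPy seeding (`seed_everything` is a hypothetical helper, not part of this commit):

```python
import random

import numpy as np
import torch


def seed_everything(seed: int = 42) -> None:
    """Seed the common RNG sources; a stand-in for the removed mgr_dict seed."""
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)


seed_everything(42)
```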

@@ -41,7 +41,7 @@ def fsdp_main(rank, world_size, args):
     # initialize the process group
     dist.init_process_group("nccl")
-    MOE_MANAGER.setup(seed=42, parallel=None)
+    MOE_MANAGER.setup(parallel=None)
     dp_size = dist.get_world_size()
     dataset = RandomDataset(

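The same signature change shows up in the FSDP example: `MOE_MANAGER.setup` no longer takes a `seed` argument. A minimal before/after usage note (the import path for `MOE_MANAGER` is an assumption):

```python
from colossalai.moe import MOE_MANAGER  # import path is an assumption

# Before this commit: the seed was passed into the manager setup.
# MOE_MANAGER.setup(seed=42, parallel=None)

# After this commit: setup no longer accepts a seed argument.
MOE_MANAGER.setup(parallel=None)
```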

@@ -1,5 +1,5 @@
 colossalai >= 0.3.3
 torch >= 1.8.1
-transformers >= 4.20.0
+transformers >= 4.20.0, <= 4.34.0
 sentencepiece
 datasets

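The requirements hunk above adds an upper bound on `transformers`. A small, hedged runtime check of that bound (assumes `transformers` and `packaging` are installed):

```python
# Verify the installed transformers version satisfies the new pin
# from requirements.txt (>= 4.20.0, <= 4.34.0).
from packaging.version import Version

import transformers

assert Version("4.20.0") <= Version(transformers.__version__) <= Version("4.34.0"), (
    f"transformers {transformers.__version__} is outside the supported range"
)
```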

@@ -213,9 +213,7 @@ def main():
         "precision": args.precision,
         "zero_stage": args.zero_stage,
     }
-    mgr_dict = {
-        "seed": 42,
-    }
+    mgr_dict = {}
     if args.plugin == "ep":
         dp_size = dist.get_world_size()
         plugin = MoeHybridParallelPlugin(