[moe] support optimizer checkpoint (#5015)
* Refactor MoE Manager setup method
* unshard optim ckpt
* optim io
* update transformer version
* update requirements
* update ckpt
* update ckpt
* update ckpt
* fix engine
* fix engine
@@ -155,9 +155,7 @@ def main():
         "precision": "bf16",
         "zero_stage": args.zero_stage,
     }
-    mgr_dict = {
-        "seed": 42,
-    }
+    mgr_dict = {}
     if args.plugin == "ep":
         dp_size = dist.get_world_size()
         plugin = MoeHybridParallelPlugin(
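The dropped "seed" entry matches the refactored MOE_MANAGER.setup call in the next hunk: the seed no longer travels through the manager's keyword arguments. A minimal sketch of the resulting call pattern, assuming mgr_dict is splatted into MOE_MANAGER.setup later in this script (the splat site is outside the hunk) and that the manager is importable from colossalai.moe:

import colossalai
from colossalai.moe import MOE_MANAGER  # import path assumed for this version

colossalai.launch_from_torch({})  # distributed init when run under torchrun

# before this commit: mgr_dict = {"seed": 42}; MOE_MANAGER.setup(parallel="EP", **mgr_dict)
# after: no seed key, so the call reduces to
mgr_dict = {}
MOE_MANAGER.setup(parallel="EP", **mgr_dict)  # parallel mode inferred from args.plugin == "ep"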
@@ -41,7 +41,7 @@ def fsdp_main(rank, world_size, args):
     # initialize the process group
     dist.init_process_group("nccl")

-    MOE_MANAGER.setup(seed=42, parallel=None)
+    MOE_MANAGER.setup(parallel=None)

     dp_size = dist.get_world_size()
     dataset = RandomDataset(
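With seed= removed from setup, reproducibility becomes the caller's responsibility. The commit does not show where seeding moved; a minimal sketch, assuming a plain torch.manual_seed before the manager call:

import torch
import torch.distributed as dist

from colossalai.moe import MOE_MANAGER  # import path assumed

def fsdp_main(rank, world_size, args):
    # initialize the process group, as in the hunk above
    dist.init_process_group("nccl")

    # seed explicitly; MOE_MANAGER.setup no longer accepts seed=
    torch.manual_seed(42)
    MOE_MANAGER.setup(parallel=None)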
@@ -1,5 +1,5 @@
 colossalai >= 0.3.3
 torch >= 1.8.1
-transformers >= 4.20.0
+transformers >= 4.20.0, <= 4.34.0
 sentencepiece
 datasets
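The new upper bound guards against breaking changes in transformers releases after 4.34.0. An illustrative runtime check mirroring the pin (not part of the commit), using the packaging library:

import transformers
from packaging import version

v = version.parse(transformers.__version__)
# mirror the requirements pin: transformers >= 4.20.0, <= 4.34.0
assert version.parse("4.20.0") <= v <= version.parse("4.34.0"), (
    f"unsupported transformers version: {v}"
)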
@@ -213,9 +213,7 @@ def main():
         "precision": args.precision,
         "zero_stage": args.zero_stage,
     }
-    mgr_dict = {
-        "seed": 42,
-    }
+    mgr_dict = {}
     if args.plugin == "ep":
         dp_size = dist.get_world_size()
         plugin = MoeHybridParallelPlugin(
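These example-script tweaks accompany the headline change: optimizer checkpoint I/O for MoE training. A minimal save/load sketch via ColossalAI's Booster checkpoint interface; the import path, the launch call, and any plugin arguments beyond the precision/zero_stage kwargs visible in the hunks above are assumptions, not taken from this diff:

import torch
import colossalai
from colossalai.booster import Booster
from colossalai.booster.plugin.moe_hybrid_parallel_plugin import MoeHybridParallelPlugin  # import path assumed

colossalai.launch_from_torch({})  # distributed init when run under torchrun

# toy model/optimizer; a real run would use an MoE model
model = torch.nn.Linear(8, 8).cuda()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# precision/zero_stage kwargs appear in the hunks above; other args may be required
plugin = MoeHybridParallelPlugin(precision="bf16", zero_stage=0)
booster = Booster(plugin=plugin)
model, optimizer, *_ = booster.boost(model, optimizer)

# save the optimizer state; this commit adds support for the unsharded path
booster.save_optimizer(optimizer, "optim.ckpt", shard=False)
# restore on resume
booster.load_optimizer(optimizer, "optim.ckpt")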