[zero] solve hang

This commit is contained in:
hxwang
2024-07-05 07:19:37 +00:00
committed by Hongxin Liu
parent 0fad23c691
commit 46c069b0db
12 changed files with 113 additions and 390 deletions

View File

@@ -141,7 +141,6 @@ def check_moe_checkpoint(test_config):
if dist.get_rank() == 0:
saved_model = model_cls.from_pretrained(model_dir).cuda()
check_model_equal(orig_model, saved_model)
# check_model_equal(model, saved_model)
saved_model.save_pretrained(hf_model_dir)
dist.barrier()
# check load model