[feat] Sync shard model (#6289)

* [feat] support hybrid parallel model sync (see the sketch after this list)

* update consumer and producer

* update files

* update producer

* remove print

* update
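
The "hybrid parallel model sync" above refers to keeping an inference-side ("producer") copy of the policy weights in step with the sharded training-side ("consumer") copy. A minimal sketch of the general technique using plain torch.distributed follows; the function name, rank layout, and process group are hypothetical and not taken from this commit:

    import torch
    import torch.distributed as dist

    def sync_model_to_producer(model: torch.nn.Module, src_rank: int, group=None):
        # Hypothetical helper: broadcast every parameter from the trainer
        # rank to the producer ranks in `group`. Under hybrid parallelism
        # (tp/pp/zero) the trainer would first materialize full tensors from
        # its local shards (e.g. via the plugin's full state-dict API)
        # before broadcasting.
        for param in model.parameters():
            dist.broadcast(param.data, src=src_rank, group=group)

All ranks in the group call this together; broadcast writes the source rank's values into each tensor in place, so producers end up with the trainer's current weights.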

---------

Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li35271158@gmail.com>
Author:    Tong Li
Date:      2025-04-30 14:47:01 +08:00
Committed: GitHub
Commit:    5fd4bcb9d8 (parent 14f237ce7e)
4 changed files with 66 additions and 20 deletions


@@ -58,7 +58,7 @@ if __name__ == "__main__":
         "--master_address", type=str, default=None, help="Master address for multi-node distributed training, Optional"
     )
     parser.add_argument(
-        "--master_port", type=int, default=29506, help="Master port for multi-node distributed training, Optional"
+        "--master_port", type=int, default=29505, help="Master port for multi-node distributed training, Optional"
     )
     # Sampling parameters
# Sampling parameters
@@ -223,7 +223,7 @@ if __name__ == "__main__":
             "zero_stage": 2,
         }, # for zero
         # plugin_config={
-        #     "tp_size": 2,
+        #     "tp_size": 1,
        #     "pp_size": 2,
        #     "microbatch_size": max(
        #         1, args.train_microbatch_size // 2
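
If the commented-out hybrid parallel block were enabled, the resulting config might read as below. This is a reconstruction for illustration only: the closing lines of the dict fall outside this hunk, and the exact set of keys the trainer accepts is assumed, not shown by the diff.

    plugin_config = {
        "tp_size": 1,  # tensor parallel degree; this commit lowers the example from 2 to 1
        "pp_size": 2,  # pipeline parallel degree
        # Halve the train microbatch across the two pipeline stages,
        # but never drop below one sample per microbatch.
        "microbatch_size": max(1, args.train_microbatch_size // 2),
    }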