[feat] Support DAPO (#6263)

* update help information
* update style
* fix
* minor fix
* support PP training
* add pp support
* remove unused code
* address conversation
* fix memory leakage support tp+pp
* move empty cache
* move empty cache
* add DAPO support
* remove format reward
* fix filtering, still buggy
* small fix
* add DAPO support
* [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci
* tested multi-node training; fix bind_batch bug
* fix conversation; support sleep mode
* support reusing excessive samples
* add dynamic batching control flag
* add dynamic batching control flag
* refactored
* fix logging

---------

Co-authored-by: Tong Li <tong.li35271158@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-09-05 19:13:01 +00:00 · 2025-04-25 17:39:17 +08:00
parent b823c6eec7
commit 26d859f68e
10 changed files with 552 additions and 359 deletions
--- a/applications/ColossalChat/coati/distributed/producer.py
+++ b/applications/ColossalChat/coati/distributed/producer.py
@@ -103,7 +103,14 @@ class BaseProducer:

                print(f"[P{self.producer_idx}] Send data {[(k, v.shape) for k, v in outputs.items()]}")
                outputs["temperature"] = torch.tensor(
-                    [self.model.generate_config.temperature] * outputs["input_ids"].size(0)
+                    [
+                        (
+                            self.model.generate_config["temperature"]
+                            if isinstance(self.model.generate_config.temperature, dict)
+                            else self.model.generate_config.temperature
+                        )
+                    ]
+                    * outputs["input_ids"].size(0)
                ).to(outputs["input_ids"].device)
                outputs = pre_send(outputs)
                ray_broadcast_tensor_dict(
@@ -113,10 +120,15 @@ class BaseProducer:
                if (i + 1) % self.num_microbatches == 0 and (
                    episode != self.num_episodes - 1 or i != num_valid_microbatches - 1
                ):
+                    if isinstance(self.model, BACKEND_MAP["vllm"]) and self.model.model_config.get(
+                        "enable_sleep_mode", False
+                    ):
+                        self.model.llm.sleep()  # evict KV cache to avoid OOM
                    # don't sync model for last iteration
                    print(
                        f"[P{self.producer_idx}] Sync model episode {episode} step {(i + 1) // self.num_microbatches - 1}"
                    )
+                    torch.cuda.empty_cache()

                    state_dict = ray_broadcast_tensor_dict(
                        None, self.num_producers, device=self.device, group_name="sync_model"
@@ -124,12 +136,21 @@ class BaseProducer:
                    self.load_state_dict(state_dict)
                    del state_dict
                    torch.cuda.empty_cache()
-                # linear annealing for 1 episode, temperature from initial to 0.7
+                    if isinstance(self.model, BACKEND_MAP["vllm"]) and self.model.model_config.get(
+                        "enable_sleep_mode", False
+                    ):
+                        self.model.llm.wake_up()
+                # linear annealing for 1 episode, temperature from initial to 0.9
                if episode <= 0:
                    ratio = 1 - (len(self.dataloader) - i) / len(self.dataloader)
-                    self.model.generate_config.temperature = (1 - ratio) * self.generate_config[
-                        "temperature"
-                    ] + ratio * 0.7
+                    if isinstance(self.model.generate_config.temperature, dict):
+                        self.model.generate_config["temperature"] = (1 - ratio) * self.generate_config[
+                            "temperature"
+                        ] + ratio * 0.9
+                    else:
+                        self.model.generate_config.temperature = (1 - ratio) * self.generate_config[
+                            "temperature"
+                        ] + ratio * 0.9


@ray.remote