From 3454b1088441fc27ce5cdad19a6f5c5c40dd14f8 Mon Sep 17 00:00:00 2001
From: Tong Li
Date: Wed, 30 Apr 2025 11:33:23 +0800
Subject: [PATCH] update files

---
 .../ColossalChat/coati/distributed/producer.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/applications/ColossalChat/coati/distributed/producer.py b/applications/ColossalChat/coati/distributed/producer.py
index 945bb05dc..97e9a4183 100644
--- a/applications/ColossalChat/coati/distributed/producer.py
+++ b/applications/ColossalChat/coati/distributed/producer.py
@@ -132,19 +132,21 @@ class BaseProducer:
                     ):
                         self.model.llm.sleep()  # revict KV_cache to avoid OOM
                     # don't sync model for last iteration
-                    print(
-                        f"[P{self.producer_idx}] Sync model episode {episode} step {(i + 1) // self.num_microbatches - 1}"
-                    )
                     torch.cuda.empty_cache()
                     if self.consumer_pp_size > 1:
                         for i in range(self.consumer_pp_size):
-                            print(f"[P{self.producer_idx}] Sync model PP stage {i}")
+                            print(
+                                f"[P{self.producer_idx}] Sync model PP stage {i} episode {episode} step {(i + 1) // self.num_microbatches - 1}"
+                            )
                             state_dict = ray_broadcast_tensor_dict(
                                 None, self.num_producers, device=self.device, group_name=f"sync_model_{i}"
                             )
                             self.load_state_dict(state_dict)
                     else:
+                        print(
+                            f"[P{self.producer_idx}] Sync model episode {episode} step {(i + 1) // self.num_microbatches - 1}"
+                        )
                         state_dict = ray_broadcast_tensor_dict(
                             None, self.num_producers, device=self.device, group_name="sync_model"
                         )
                         self.load_state_dict(state_dict)
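
Note (not part of the patch): the change relocates the "Sync model" log so the
pipeline-parallel branch prints one line per PP stage, including the episode and
trainer step, while the non-PP branch keeps a single episode/step line; the
broadcast calls themselves are unchanged. Below is a minimal Python sketch of the
resulting control flow, not a drop-in replacement: `broadcast` and `load` stand
in for ray_broadcast_tensor_dict and BaseProducer.load_state_dict, and each
parameter mirrors a producer attribute used in the diff.

    from typing import Callable, Dict

    import torch

    def sync_model(
        producer_idx: int,
        num_producers: int,
        num_microbatches: int,
        consumer_pp_size: int,
        episode: int,
        batch_idx: int,
        device: torch.device,
        broadcast: Callable[..., Dict[str, torch.Tensor]],
        load: Callable[[Dict[str, torch.Tensor]], None],
    ) -> None:
        # Compute the trainer step once from the dataloader index, so the
        # stage loop below cannot shadow the counter it is derived from.
        step = (batch_idx + 1) // num_microbatches - 1
        if consumer_pp_size > 1:
            # One broadcast group per pipeline stage: "sync_model_0", "sync_model_1", ...
            for stage in range(consumer_pp_size):
                print(f"[P{producer_idx}] Sync model PP stage {stage} episode {episode} step {step}")
                load(broadcast(None, num_producers, device=device, group_name=f"sync_model_{stage}"))
        else:
            # A single broadcast group when the consumer is not pipeline-parallel.
            print(f"[P{producer_idx}] Sync model episode {episode} step {step}")
            load(broadcast(None, num_producers, device=device, group_name="sync_model"))

One detail worth flagging in the diff itself: the PP branch reuses `i` as the
stage loop variable, so the `step {(i + 1) // self.num_microbatches - 1}` printed
inside that loop is computed from the stage index rather than from the outer
dataloader step the else branch uses; hoisting the step computation above the
loop, as in the sketch, would keep the two log lines consistent.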