From 56e4e741403c71b47c94a7c857d3480d154dd2c0 Mon Sep 17 00:00:00 2001 From: Tong Li Date: Wed, 23 Apr 2025 17:20:09 +0800 Subject: [PATCH] boxed version --- applications/ColossalChat/coati/dataset/loader.py | 2 +- .../ColossalChat/coati/distributed/reward/reward_fn.py | 10 +++++----- .../coati/distributed/reward/reward_utils.py | 3 ++- applications/ColossalChat/rl_example.py | 6 +++--- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/applications/ColossalChat/coati/dataset/loader.py b/applications/ColossalChat/coati/dataset/loader.py index 43cf78383..f514c3b84 100755 --- a/applications/ColossalChat/coati/dataset/loader.py +++ b/applications/ColossalChat/coati/dataset/loader.py @@ -375,7 +375,7 @@ def apply_chat_template_and_mask( tokens = [] assistant_mask = [] for i, msg in enumerate(chat): - msg_tokens = tokenizer.apply_chat_template([system_element, msg], tokenize=True, add_generation_prompt=True) + msg_tokens = tokenizer.apply_chat_template([msg], tokenize=True, add_generation_prompt=True) # remove unexpected bos token if i > 0 and msg_tokens[0] == tokenizer.bos_token_id: msg_tokens = msg_tokens[1:] diff --git a/applications/ColossalChat/coati/distributed/reward/reward_fn.py b/applications/ColossalChat/coati/distributed/reward/reward_fn.py index 53bc15e25..32240f6b3 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_fn.py +++ b/applications/ColossalChat/coati/distributed/reward/reward_fn.py @@ -18,12 +18,12 @@ def math_reward_fn(input_ids, gt_answer, response_idx, **kwargs): gt_answer = tokenizer.decode(gt_answer.squeeze(0), skip_special_tokens=True) final_answer, processed_str = extract_solution(decoded_final_answer) - format_valid = validate_response_structure(processed_str, kwargs["tags"]) + # format_valid = validate_response_structure(processed_str, kwargs["tags"]) - # Check format accuracy - if format_valid: - format_reward += format_score - reward += format_score + # # Check format accuracy + # if format_valid: + # format_reward += format_score + # reward += format_score # Check answer accuracy if ( diff --git a/applications/ColossalChat/coati/distributed/reward/reward_utils.py b/applications/ColossalChat/coati/distributed/reward/reward_utils.py index c1e73d4b9..6f0143a24 100644 --- a/applications/ColossalChat/coati/distributed/reward/reward_utils.py +++ b/applications/ColossalChat/coati/distributed/reward/reward_utils.py @@ -66,7 +66,8 @@ def extract_solution(solution_str: str) -> Tuple[Optional[str], str]: """ # Extract final answer using XML-style tags - answer_pattern = r"(.*?)" + # answer_pattern = r"(.*?)" + answer_pattern = r"boxed{(.*?)}" matches = list(re.finditer(answer_pattern, solution_str, re.DOTALL)) if not matches: diff --git a/applications/ColossalChat/rl_example.py b/applications/ColossalChat/rl_example.py index f42a660b7..5b17eb32a 100644 --- a/applications/ColossalChat/rl_example.py +++ b/applications/ColossalChat/rl_example.py @@ -44,7 +44,7 @@ if __name__ == "__main__": "-tmbs", "--train-microbatch-size", type=int, - default=2, + default=1, help="Effective batch size per dp group for forwarding and backwarding. Please select based on the availiable memory.", ) parser.add_argument("-b", "--backend", type=str, default="transformers", choices=["transformers", "vllm"]) @@ -84,7 +84,7 @@ if __name__ == "__main__": inference_model_config.update(dict(gpu_memory_utilization=0.7, enforce_eager=True, enable_chunked_prefill=True)) generate_config.update( dict( - max_tokens=2048, + max_tokens=4096, ignore_eos=True, include_stop_str_in_output=True, stop=[""], @@ -107,7 +107,7 @@ if __name__ == "__main__": num_producers=args.num_inferencer, num_proc_per_producer=1, num_consumer_procs=args.num_trainers, - num_episodes=1, + num_episodes=2, inference_batch_size=args.inference_batch_size, inference_microbatch_size=args.inference_microbatch_size, train_batch_size=args.train_batch_size,