Mirror of https://github.com/hpcaitech/ColossalAI.git, synced 2025-09-05 19:13:01 +00:00
Add GRPO and Support RLVR for PPO (#6186)
* add grpo, support rlvr
* add grpo, support rlvr
* tested deepseek r1 pipeline
* add ci
* verify grpo r1
* verify grpo r1
* update readme, remove unused code
* [pre-commit.ci] auto fixes from pre-commit.com hooks; for more information, see https://pre-commit.ci
* remove path
* clean code
* fix circular import
* fix ci OOM
* fix ci OOM
* skip kto tp, fix qwen generation

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
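For context: GRPO (Group Relative Policy Optimization, introduced with DeepSeekMath and used in the DeepSeek R1 pipeline this commit tests) drops PPO's learned critic and instead samples a group of responses per prompt, normalizing each response's reward by its group's statistics. A minimal sketch of that group-relative advantage; the function name and shapes are illustrative, not taken from this commit:

```python
import torch

def group_relative_advantages(rewards: torch.Tensor, eps: float = 1e-6) -> torch.Tensor:
    """GRPO-style advantages from one scalar reward per sampled response.

    rewards: shape (num_prompts, group_size).
    Each response is scored relative to its own group's mean and std,
    so no learned value function (critic) is required.
    """
    mean = rewards.mean(dim=-1, keepdim=True)
    std = rewards.std(dim=-1, keepdim=True)
    return (rewards - mean) / (std + eps)
```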
```diff
@@ -3,13 +3,13 @@ PPO trainer
 """

 import os
-from typing import Dict, List, Optional
+from typing import Dict, List, Optional, Union

 import torch
 import wandb
 from coati.experience_buffer import NaiveExperienceBuffer
 from coati.experience_maker import Experience, NaiveExperienceMaker
-from coati.models import Critic, RewardModel
+from coati.models import Critic, RewardModel, RLVRRewardModel
 from coati.models.loss import GPTLMLoss, PolicyLoss, ValueLoss
 from coati.models.utils import calc_action_log_probs
 from coati.trainer.callbacks import Callback
```
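The new RLVRRewardModel import points at reinforcement learning with verifiable rewards: scoring a response by checking it against a known ground truth rather than through a learned reward head. The class's real definition is not shown in this diff; the sketch below is only an assumed shape, with a hypothetical verify_fn:

```python
import torch
from typing import Callable, List

class RLVRRewardModelSketch:
    """Hypothetical stand-in for coati.models.RLVRRewardModel.

    The reward comes from a rule-based verifier (exact match,
    math-answer checking, unit tests, ...) rather than a trained
    scalar head, which is what makes it "verifiable".
    """

    def __init__(self, verify_fn: Callable[[str, str], float]):
        self.verify_fn = verify_fn

    def __call__(self, responses: List[str], gt_answers: List[str]) -> torch.Tensor:
        # One scalar reward per (response, ground-truth) pair.
        return torch.tensor([self.verify_fn(r, gt) for r, gt in zip(responses, gt_answers)])
```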
```diff
@@ -84,7 +84,7 @@ class PPOTrainer(OLTrainer):
         critic_booster: Booster,
         actor: PreTrainedModel,
         critic: Critic,
-        reward_model: RewardModel,
+        reward_model: Union[RewardModel, RLVRRewardModel],
         initial_model: PreTrainedModel,
         actor_optim: Optimizer,
         critic_optim: Optimizer,
```
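Widening the annotation to Union[RewardModel, RLVRRewardModel] lets the same trainer slot take either model, but call sites then need to know which variant they hold, since only the RLVR model consumes a ground truth. One plausible pattern is an isinstance branch; the call signatures below are assumptions, not coati's actual API:

```python
from coati.models import RewardModel, RLVRRewardModel  # as imported in this commit

# Plausible dispatch at a reward-scoring call site; signatures are assumed.
def compute_reward(reward_model, sequences, attention_mask, gt_answer=None):
    if isinstance(reward_model, RLVRRewardModel):
        # Verifiable reward: needs the ground-truth answer.
        return reward_model(sequences, attention_mask, gt_answer=gt_answer)
    # Learned reward: scores the sequence alone.
    return reward_model(sequences, attention_mask)
```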
```diff
@@ -210,6 +210,7 @@ class PPOTrainer(OLTrainer):
         return self.experience_maker.make_experience(
             input_ids=prompts["input_ids"].to(get_current_device()),
             attention_mask=prompts["attention_mask"].to(get_current_device()),
+            gt_answer=prompts["gt_answer"],
             **self.generate_kwargs,
         )
```
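The added gt_answer keyword threads the dataset's ground-truth answers from the prompt batch into experience making, presumably so the reward step can verify what the actor generates. A hedged sketch of that wiring inside an experience maker; the class, method, and attribute names are illustrative, not coati's:

```python
import torch

class VerifiableExperienceMakerSketch:
    """Illustrative only: shows where gt_answer could enter reward scoring."""

    def __init__(self, actor, reward_model, tokenizer):
        self.actor = actor
        self.reward_model = reward_model
        self.tokenizer = tokenizer

    @torch.no_grad()
    def make_experience(self, input_ids, attention_mask, gt_answer=None, **generate_kwargs):
        sequences = self.actor.generate(
            input_ids, attention_mask=attention_mask, **generate_kwargs
        )
        responses = self.tokenizer.batch_decode(sequences, skip_special_tokens=True)
        # With an RLVR-style model, rewards come from checking the decoded
        # answers against gt_answer; otherwise score the sequences directly.
        if gt_answer is not None:
            rewards = self.reward_model(responses, gt_answer)
        else:
            rewards = self.reward_model(sequences)
        return sequences, rewards
```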