Mirror of https://github.com/hpcaitech/ColossalAI.git
[ColossalChat] Update RLHF V2 (#5286)
* Add DPO. Fix SFT, PPO, LoRA. Refactor all
* fix and tested ppo
* 2nd round refactor
* add ci tests
* fix ci
* fix ci
* fix readme, style
* fix readme style
* fix style, fix benchmark
* reproduce benchmark result, remove useless files
* rename to ColossalChat
* use new image
* fix ci workflow
* fix ci
* use local model/tokenizer for ci tests
* fix ci
* fix ci
* fix ci
* fix ci timeout
* fix rm progress bar. fix ci timeout
* fix ci
* fix ci typo
* temporarily remove 3d plugin from ci
* test environment
* cannot save optimizer
* support chat template
* fix readme
* fix path
* test ci locally
* restore build_or_pr
* fix ci data path
* fix benchmark
* fix ci, move ci tests to 3080, disable fast tokenizer
* move ci to 85
* support flash attention 2
* add all-in-one data preparation script. Fix colossal-llama2-chat chat template
* add hardware requirements
* move ci test data
* fix save_model, add unwrap
* fix missing bos
* fix missing bos; support grad accumulation with gemini
* fix ci
* fix ci
* fix ci
* fix llama2 chat template config
* debug sft
* debug sft
* fix colossalai version requirement
* fix ci
* add sanity check to prevent NaN loss
* fix requirements
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* add dummy data generation script
* update readme
* update readme
* update readme and ignore
* fix logger bug
* support parallel_output
* modify data preparation logic
* fix tokenization
* update lr
* fix inference
* run pre-commit

---------

Co-authored-by: Tong Li <tong.li352711588@gmail.com>
applications/ColossalChat/coati/utils/__init__.py (new executable file, +4 lines)
@@ -0,0 +1,4 @@
from .accumulative_meter import AccumulativeMeanMeter
from .ckpt_io import load_checkpoint, save_checkpoint

__all__ = ["load_checkpoint", "save_checkpoint", "AccumulativeMeanMeter"]
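For context, a minimal sketch (not part of the commit) of how downstream trainer code would consume the API re-exported above; only the three names in __all__ are public:

# Hypothetical consumer code: import the utilities through the package root.
from coati.utils import AccumulativeMeanMeter, load_checkpoint, save_checkpoint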
applications/ColossalChat/coati/utils/accumulative_meter.py (new executable file, +69 lines)
@@ -0,0 +1,69 @@
"""
Classes for tracking the accumulative (running) mean of variables.
"""


class AccumulativeMeanVariable:
    """
    A class that calculates the accumulative mean of a variable.
    """

    def __init__(self):
        self._sum = 0
        self._count = 0

    def add(self, value, count_update=1):
        """
        Adds a value to the sum and updates the count.

        Args:
            value (float): The value to be added.
            count_update (int, optional): The amount to update the count by. Defaults to 1.
        """
        self._sum += value
        self._count += count_update

    def get(self):
        """
        Calculates and returns the accumulative mean.

        Returns:
            float: The accumulative mean.
        """
        return self._sum / self._count if self._count > 0 else 0

    def reset(self):
        """
        Resets the sum and count to zero.
        """
        self._sum = 0
        self._count = 0


class AccumulativeMeanMeter:
    """
    A class for calculating and storing the accumulative mean of named variables.

    Attributes:
        variable_dict (dict): A dictionary mapping variable names to AccumulativeMeanVariable instances.

    Methods:
        add(name, value, count_update=1): Adds a value to the specified variable.
        get(name): Retrieves the accumulative mean value of the specified variable.
        reset(): Resets all the accumulative mean variables to their initial state.
    """

    def __init__(self):
        self.variable_dict = {}

    def add(self, name, value, count_update=1):
        if name not in self.variable_dict:
            self.variable_dict[name] = AccumulativeMeanVariable()
        self.variable_dict[name].add(value, count_update=count_update)

    def get(self, name):
        return self.variable_dict[name].get()

    def reset(self):
        for name in self.variable_dict:
            self.variable_dict[name].reset()
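For context, a minimal usage sketch (not part of the commit; the loss values are made up) showing how a training loop might use the meter to log running means and then clear them between logging windows:

# Hypothetical usage: track running means of training statistics by name.
from coati.utils import AccumulativeMeanMeter

meter = AccumulativeMeanMeter()
for step, loss in enumerate([0.9, 0.7, 0.6, 0.5]):  # made-up loss values
    meter.add("loss", loss)  # accumulate one sample per step
    meter.add("lr", 1e-5)    # any named scalar can be tracked

print(meter.get("loss"))     # mean over the accumulated steps: 0.675
meter.reset()                # start a fresh accumulation window

A variable is created lazily on first add(), and get() returns 0 for a variable that has accumulated no counts, so the meter never divides by zero.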
applications/ColossalChat/coati/utils/ckpt_io.py (new executable file, +93 lines)
@@ -0,0 +1,93 @@
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Helper functions for saving and loading checkpoints.
"""

import json
import os
from typing import Any, Dict, Tuple, Union

import torch
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.optimizer import Optimizer

from colossalai.booster import Booster
from colossalai.cluster import DistCoordinator


def load_json(file_path: Union[str, os.PathLike]) -> Dict[str, Any]:
    """
    Load a file in JSON format.
    """
    with open(file=file_path, mode="r", encoding="utf-8") as fp:
        return json.load(fp)


def save_json(data: Dict[str, Any], file_path: Union[str, os.PathLike]) -> None:
    """
    Save data in JSON format.
    """
    with open(file=file_path, mode="w", encoding="utf-8") as fp:
        json.dump(data, fp=fp, ensure_ascii=False, indent=4)


def save_checkpoint(
    save_dir: Union[str, os.PathLike],
    booster: Booster,
    model: torch.nn.Module,
    optimizer: Optimizer,
    lr_scheduler: _LRScheduler,
    epoch: int,
    step: int,
    batch_size: int,
    coordinator: DistCoordinator,
) -> None:
    """
    Save the model checkpoint, optimizer, LR scheduler and intermediate running states.
    """

    save_dir = os.path.join(save_dir, f"epoch-{epoch}_step-{step}")
    os.makedirs(os.path.join(save_dir, "modeling"), exist_ok=True)

    booster.save_model(model, os.path.join(save_dir, "modeling"), shard=True)

    """
    Temporarily disable the following, as save_optimizer causes all processes to hang
    in a multi-GPU environment; working on fixing this bug.
    """
    booster.save_optimizer(optimizer, os.path.join(save_dir, "optimizer"), shard=True)
    booster.save_lr_scheduler(lr_scheduler, os.path.join(save_dir, "lr_scheduler"))
    running_states = {
        "epoch": epoch,
        "step": step,
        "sample_start_index": step * batch_size,
    }
    if coordinator.is_master():
        save_json(running_states, os.path.join(save_dir, "running_states.json"))


def load_checkpoint(
    load_dir: Union[str, os.PathLike],
    booster: Booster,
    model: torch.nn.Module,
    optimizer: Optimizer,
    lr_scheduler: _LRScheduler,
) -> Tuple[int, int, int]:
    """
    Load the model checkpoint, optimizer, LR scheduler and intermediate running states.
    """

    # Update booster params states.
    booster.load_model(model=model, checkpoint=os.path.join(load_dir, "modeling"))
    booster.load_optimizer(optimizer=optimizer, checkpoint=os.path.join(load_dir, "optimizer"))
    booster.load_lr_scheduler(lr_scheduler=lr_scheduler, checkpoint=os.path.join(load_dir, "lr_scheduler"))

    running_states = load_json(file_path=os.path.join(load_dir, "running_states.json"))
    return (
        running_states["epoch"],
        running_states["step"],
        running_states["sample_start_index"],
    )
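For context, a minimal sketch of the save/resume flow these helpers support. It is not part of the commit: the model, optimizer, scheduler and directory names are placeholders, the TorchDDPPlugin choice is an assumption, and the launch call reflects colossalai versions contemporary with this commit (newer releases drop the config argument). It assumes launch under torchrun so distributed state is initialized:

# Hypothetical sketch, assuming a torchrun launch and placeholder training objects.
import colossalai
import torch
from colossalai.booster import Booster
from colossalai.booster.plugin import TorchDDPPlugin
from colossalai.cluster import DistCoordinator
from torch.optim.lr_scheduler import StepLR

from coati.utils import load_checkpoint, save_checkpoint

colossalai.launch_from_torch(config={})  # config dict required in 0.3.x-era versions
coordinator = DistCoordinator()

model = torch.nn.Linear(8, 8).cuda()  # placeholder model
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
lr_scheduler = StepLR(optimizer, step_size=1)

booster = Booster(plugin=TorchDDPPlugin())
model, optimizer, _, _, lr_scheduler = booster.boost(model, optimizer, lr_scheduler=lr_scheduler)

# ... training steps ...
save_checkpoint(
    "outputs", booster, model, optimizer, lr_scheduler,
    epoch=0, step=100, batch_size=4, coordinator=coordinator,
)

# Resuming later: note that save_checkpoint appended "epoch-0_step-100" to
# save_dir, so the load path must include it. States are restored in place
# and the saved counters come back for the dataloader to skip ahead.
epoch, step, sample_start_index = load_checkpoint(
    "outputs/epoch-0_step-100", booster, model, optimizer, lr_scheduler
)

Routing the saves through the Booster rather than raw torch.save is what makes the helpers plugin-agnostic: sharded and distributed states are handled by whichever plugin the Booster was built with, while only the master rank writes the small running_states.json.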