Compare commits


950 Commits
v0.3.4 ... main

Author SHA1 Message Date
flybird11111
46ed5d856b
[ci] update ci (#6254)
* fix for async io

* test for upgrading transformers

* add ci machine

* fix

* fix

* fix

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update test_fp16_torch.py

* Update build_on_pr.yml

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* fix

* fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-04-18 16:40:53 +08:00
Yanjia0
7ecdf9a211
Update README.md (#6268)
Change image from H100 to H200
2025-04-17 12:07:25 +08:00
duanjunwen
44d4053fec
[HotFix] update load lora model Readme; (#6240)
* [fix] update load lora model Readme;

* [fix] update lora infer readme

* [fix] remove useless comments
2025-03-07 14:14:26 +08:00
Hongxin Liu
6d676ee0e9
[release] update version (#6236) 2025-03-03 16:15:09 +08:00
Hongxin Liu
56fe130b15
[hotfix] fix lora load (#6231)
* [hotfix] fix lora load

* [hotfix] fix hp load

* accelerate deepseek loading
2025-03-01 19:04:14 +08:00
Hongxin Liu
f32861ccc5
[misc] update torch version (#6206)
* [misc] update torch version

* fix test

* fix test

* fix test

* fix test
2025-02-24 14:35:48 +08:00
YeAnbang
b9e60559b8
Merge pull request #6208 from hpcaitech/grpo_dev
[Chat] fix colossalchat bugs
2025-02-20 21:23:16 +08:00
pre-commit-ci[bot]
7595c453a5 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2025-02-20 10:25:19 +00:00
YeAnbang
53834b74b9 fix num_train_step update 2025-02-20 18:24:04 +08:00
YeAnbang
0171884664 fix inference rebatching bug 2025-02-20 17:28:49 +08:00
Hongxin Liu
9379cbd668
[release] update version (#6195)
* [release] update version

* fix test

* fix test
2025-02-20 11:36:18 +08:00
binmakeswell
24dee8f0b7
[doc] DeepSeek V3/R1 news (#6199)
* [doc] DeepSeek V3/R1 news

* [doc] DeepSeek V3/R1 news

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-02-19 15:07:29 +08:00
Hongxin Liu
f73ae55394
[application] add lora sft example data (#6198) 2025-02-18 20:18:18 +08:00
Tong Li
f8b9e88484
[application] Update README (#6196)
* remove unused ray

* remove unused readme

* update readme

* update readme

* update

* update

* add link

* update readme

* update readme

* fix link

* update code

* update citation

* update

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update readme

* update project

* add images

* update link

* update note

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-02-18 20:17:56 +08:00
Hongxin Liu
d54642a263
[application] add lora sft example (#6192)
* [application] add lora sft example

* update requirements

* update readme

* update comment

* update ci
2025-02-18 13:06:38 +08:00
YeAnbang
d20c8ffd97
Add GRPO and Support RLVR for PPO (#6186)
* add grpo, support rlvr

* add grpo, support rlvr

* tested deepseek r1 pipeline

* add ci

* verify grpo r1

* verify grpo r1

* update readme, remove unused code

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* remove path

* clean code

* fix circular import

* fix ci OOM

* fix ci OOM

* skip kto tp, fix qwen generation

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-02-18 09:43:36 +08:00
flybird11111
ce0ec40811
[checkpointio] fix for async io (#6189) 2025-02-14 17:34:13 +08:00
Hongxin Liu
5ff5323538
[hotfix] fix zero optim save (#6191) 2025-02-14 15:09:50 +08:00
Hongxin Liu
014837e725
[shardformer] support pipeline for deepseek v3 and optimize lora save (#6188)
* [shardformer] support pipeline for deepseek v3

* [checkpointio] fix lora save

* [devops] update ci env

* [booster] optimize lora

* fix test

* fix test
2025-02-14 14:48:54 +08:00
Wenxuan Tan
ec73f1b5e2
[CI] Cleanup Dist Optim tests with shared helper funcs (#6125)
* Refactor and clean up using common helper funcs. Tests passed

* Update comments

* Fix relative import

* Fix param fetching bug
2025-02-12 13:42:34 +08:00
flybird11111
5c09d726a6
[checkpointio] fix checkpoint for 3d (#6187)
* fix checkpoint io for 3d

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update hybrid_parallel_checkpoint_io.py

* fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-02-12 11:54:55 +08:00
Hongxin Liu
2b415e5999
[shardformer] support ep for deepseek v3 (#6185)
* [feature] support ep for deepseek v3

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix test

* [shardformer] fix deepseek v3 init

* [lazy] fit lora for lazy init

* [example] support npu for deepseek v3

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-02-11 16:10:25 +08:00
flybird11111
17062c83b9
[hotfix] fix hybrid checkpointio for sp+dp (#6184)
* Update hybrid_parallel_plugin.py

* Update hybrid_parallel_plugin.py

* Update hybrid_parallel_plugin.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update build_on_pr.yml

* Update test_zerobubble_pp.py

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-02-06 17:21:04 +08:00
Wenxuan Tan
ca0aa2365d
[Issue template] Add checkbox asking for details to reproduce error (#6104)
* Add checkbox asking about reproducing error

* update

* Update

* Update checkbox
2025-01-24 14:36:25 +08:00
Lemon Qin
97e60cbbcb
[checkpointio] gather tensor before unpadding it if the tensor is both padded and distributed (#6168) 2025-01-21 10:23:15 +08:00
Guangyao Zhang
5b094a836b
[Inference] Fix example in readme (#6178) 2025-01-08 11:51:50 +08:00
Hongxin Liu
ee81366cac
[checkpointio] support load-pin overlap (#6177)
* [checkpointio] support load-pin overlap

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [test] add conftest

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-01-07 16:16:04 +08:00
Hongxin Liu
479067e9bc
[release] update version (#6174)
* [release] update version

* [devops] fix test pypi ci

* [devops] fix test pypi ci
2025-01-03 11:52:23 +08:00
pre-commit-ci[bot]
7fdef9fd6b
[pre-commit.ci] pre-commit autoupdate (#6113)
updates:
- [github.com/pre-commit/mirrors-clang-format: v19.1.2 → v19.1.5](https://github.com/pre-commit/mirrors-clang-format/compare/v19.1.2...v19.1.5)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-01-02 10:23:20 +08:00
duanjunwen
a9bedc7a43
[Shardformer] Support zbv in Shardformer Policy (#6150)
* [feat] Shardformer support zbv

* [feat] support chatglm2, command, deepseek for zbv

* [feat] support zbv in shardformer policy:
falcon, gptj, mistral, opt, qwen2, t5, vit, whisper

* [feat] support GPT2FusedLinearConv1D

* [feat] support GPT2FusedLinear (without tp)

* [fix] debug FusedConvLinear

* [shardformer] support gpt2 policy for zbv, support GPT2FusedLinearConv
Col and Row.

* [Shardformer] support FusedLinear1D base for zbv

* [shardformer] support zbv in FusedLinear1D base, Col, Row

* [shardformer] support zbv in blip2 and sam policy

* [shardformer] fix bug with incorrect number of gradients; add fusedLinear
base testcase;

* [fix] fix incorrect number of gradients;

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [Shardformer] add en doc for zbv;

* [fix] fix typo in Model compatibility table

* [fix] fix API Reference typo

* [Shardformer] add zh-Han doc for zbv

* [fix] fix Linear name; update en & zh doc

* [fix] fix shardformer doc import err

* [fix] fix shardconfig import in doc

* [fix] fix shardformer doc

* [fix] fix shardconfig doc

* [fix] fix config

* [fix] remove shardconfig

* [fix] fix doc

* [feat] add zbv doc string

* [fix] rm doc

* [fix] fix doc

* [fix] empty zbv doc

* [fix] fix torch version

* [fix] fix torch version

* [fix] fix torch versions

* [fix] fix torch versions

* [fix] fix pyramid versions

* [fix] fix pyramid, zope version

* [fix] try fix workflow

* [fix] try import ShardConfig in yml

* [fix] fix workflow

* [fix] fix workflow

* [fix] fix workflow

* [fix] fix workflow

* [fix] fix ci

* [fix] fix zbv doc

* [fix] fix param for qkv linear, gpt2fused linear; fix requirements;

* [fix] fix policy use fused_linear

* [fix] fix weight grad None, err caused by weight ptr change

* [fix] fix comm in WeightGradStore

* [fix] fix WeightGradStore pop param

* [fix] remove useless param in doc; fix gpt2 qkv test;

* [shardformer] simplify execute_w_pass_grad_accum;

* [fix] rm useless comments

* [shardformer] simplify execute_w_pass_grad_accum & execute_w_pass

* [shardformer] Run meaningful doc test

* [shardformer] fix doc test cmd;

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2025-01-02 10:22:26 +08:00
Hongxin Liu
af06d162cf
[checkpointio] support non blocking pin load (#6172)
* [checkpointio] support non blocking pin load

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-12-25 17:03:25 +08:00
binmakeswell
836992438f
[news] release colossalai for sora (#6166)
* [news] release colossalai for sora

* [news] release colossalai for sora

* [news] release colossalai for sora

* [news] release colossalai for sora
2024-12-23 21:59:39 +08:00
Hongxin Liu
8b0ed61490
[hotfix] improve compatibility (#6165) 2024-12-23 18:57:08 +08:00
binmakeswell
5f82bfa636
[doc] add bonus event (#6164)
* [doc] add bonus event

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-12-23 17:41:59 +08:00
duanjunwen
fa9d0318e4
[Hotfix] hotfix normalization (#6163)
* [fix] hotfix normalization

* [hotfix] force doc ci test

* [hotfix] fallback doc
2024-12-23 16:29:48 +08:00
flybird11111
130229fdcb
[checkpointio] support asyncio for 3d (#6152)
* fix

* fix

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update utils.py

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-12-23 10:24:22 +08:00
flybird11111
aaafb38851
[Device] Support npu (#6159)
* support npu

* support pretrain

support pretrain

fix

* support lora

fix

fix

* support chatglm

fix

fix

fix

[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

fix

fix

[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

fix

[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

fix

fix

fix

* Update train.py

* Update train.py

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-12-17 15:42:39 +08:00
flybird11111
e994c64568
[checkpointio] fix async io (#6155) 2024-12-16 10:36:28 +08:00
Hongxin Liu
de3d371f65
[hotfix] fix zero comm buffer init (#6154) 2024-12-10 16:46:15 +08:00
duanjunwen
8d826a336e
[fix] fix bug caused by perf version (#6156) 2024-12-10 15:03:16 +08:00
Hongxin Liu
6280cb18b8
[checkpointio] support debug log (#6153)
* [checkpointio] support debug log

* [checkpointio] refactor async writer api

* fix test

* fix test
2024-12-02 11:29:19 +08:00
Hongxin Liu
ab856fd308
[checkpointio] fix zero optimizer async save memory (#6151)
* [checkpointio] fix zero optimizer async save memory

* [checkpointio] fit new tensornvme api

* [checkpointio] fit new tensornvme api
2024-11-25 14:46:31 +08:00
Wang Binluo
8ecff0cb7f
Merge pull request #6149 from ver217/hotfix/ckpt
[checkpointio] disable buffering
2024-11-21 16:05:19 +08:00
ver217
8fddbab04c [checkpointio] disable buffering 2024-11-21 14:33:26 +08:00
Sze-qq
152162a80e
[doc] update cloud link (#6148)
Co-authored-by: Siqi <hpc@192.168.1.4>
2024-11-20 22:00:10 +08:00
Hongxin Liu
cf519dac6a
[optim] hotfix adam load (#6146)
* [optim] hotfix adam load

* [checkpointio] fix optimizer async io

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [checkpointio] update test

* [checkpointio] update test

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-11-20 16:36:37 +08:00
Sze-qq
5caad13055
[doc] add hpc cloud intro (#6147)
* update readme

* update readme

---------

Co-authored-by: Siqi <hpc@hpcdeMacBook-Pro.local>
2024-11-20 15:47:30 +08:00
duanjunwen
e0c68ab6d3
[Zerobubble] merge main. (#6142)
* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;

* [feat] add dw test;

* [fix] fix weight not close;

* [update] update text;

* [feat] add test run_fwd_bwd automatic scheduling;

* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;

* [feat] add test for p & p grad;

* [feat] add comments for ZBV func;

* [fix] rm useless assign and comments;

* [fix] fix ci test; add pytest;

* [feat] add run_fwd_bwd_with_microbatch  (replace input) & test; add p&p.grad assert close test & all pass;

* [feat] add apply v_schedule graph; p & p.grad assert err exist;

* [fix] update

* [feat] fix ci; add assert;

* [feat] fix poc format

* [feat] fix func name & ci; add comments;

* [fix] fix poc test; add comments in poc;

* [feat] add optim backward_b_by_grad

* [feat] fix optimizer bwd b & w; support return accum loss & output

* [feat] add fwd_bwd_step, run_fwd_only;

* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;

* [fix] fix communication_map;

* [feat] update test; rm comments;

* [fix] rm zbv in hybridplugin

* [fix] fix optim bwd;

* [fix] fix optim bwd;

* [fix] rm output.data after send fwd;

* [fix] fix bwd step if condition; remove useless comments and format info;

* [fix] fix detach output & release output;

* [fix] rm requires_grad for output;

* [fix] fix requires_grad position and detach position and input & output local buffer append position;

* [feat] add memory assertion;

* [fix] fix mem check;

* [fix] mem assertion

* [fix] fix mem assertion

* [fix] fix mem; use a new model shape; only assert mem less than or equal to theo;

* [fix] fix model zoo import;

* [fix] fix redundant detach & clone; add buffer assertion at the end;

* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.requires_grad_ with treemap;

* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;

* [fix] add testcase with microbatch 4;

* [zerobubble]Support ZeroBubble Pipeline (#6034)

* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;

* [feat] add dw test;

* [fix] fix weight not close;

* [update] update text;

* [feat] add test run_fwd_bwd automatic scheduling;

* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;

* [feat] add test for p & p grad;

* [feat] add comments for ZBV func;

* [fix] rm useless assign and comments;

* [fix] fix ci test; add pytest;

* [feat] add run_fwd_bwd_with_microbatch  (replace input) & test; add p&p.grad assert close test & all pass;

* [feat] add apply v_schedule graph; p & p.grad assert err exist;

* [fix] update

* [feat] fix ci; add assert;

* [feat] fix poc format

* [feat] fix func name & ci; add comments;

* [fix] fix poc test; add comments in poc;

* [feat] add optim backward_b_by_grad

* [feat] fix optimizer bwd b & w; support return accum loss & output

* [feat] add fwd_bwd_step, run_fwd_only;

* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;

* [fix] fix communication_map;

* [feat] update test; rm comments;

* [fix] rm zbv in hybridplugin

* [fix] fix optim bwd;

* [fix] fix optim bwd;

* [fix] rm output.data after send fwd;

* [fix] fix bwd step if condition; remove useless comments and format info;

* [fix] fix detach output & release output;

* [fix] rm requires_grad for output;

* [fix] fix requires_grad position and detach position and input & output local buffer append position;

* [feat] add memory assertion;

* [fix] fix mem check;

* [fix] mem assertion

* [fix] fix mem assertion

* [fix] fix mem; use a new model shape; only assert mem less than or equal to theo;

* [fix] fix model zoo import;

* [fix] fix redundant detach & clone; add buffer assertion at the end;

* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.requires_grad_ with treemap;

* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;

* [fix] add testcase with microbatch 4;

* [feat] moehybrid support zerobubble;

* [fix] fix zerobubble pp for shardformer type input;

* [feat] add more test;

* [fix] fix require_grad & deallocate call;

* [fix] update bwd b&w input; dict --> list[torch.Tensor]

* [fix] fix bwd w input;

* [fix] fix mem assert;

* [fix] fix input_tensors buffer append: input_obj (dict) --> Tuple (microbatch, input_obj), and all bwd b related calc logic;

* [fix] use tree_flatten replace dict traverse;

* [fix] rm comments;

* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs

* [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch;

* [fix] fix detach clone release order;

* [fix] fix ci --> oom in 4096 hidden dim;

* [fix] fix dumb clone;

* [fix] fix detach_output_obj clone;

* [fix] fix stage_indices;

* [fix] fix traverse; traverse dict --> traverse tensor List;

* [fix] fix zerobubble; support shardformer model type;

* [fix] rm comments;

* [fix] fix test_pipeline_utils ci;

* [fix] remove duplicate arg; rm comments;

* [fix] remove chunk 0 stage 0 bwd b; you don't have to calc the microbatch's dx;

* [fix] rm print & comments;

* [plugin] hybrid support zero bubble pipeline (#6060)

* hybrid support zbv

* fix

fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* Update zero_bubble_pp.py

* fix

* fix-ci

* fix

[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* fix

* [zerobubble]Support ZeroBubble Pipeline (#6034)

* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;

* [feat] add dw test;

* [fix] fix weight not close;

* [update] update text;

* [feat] add test run_fwd_bwd automatic scheduling;

* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;

* [feat] add test for p & p grad;

* [feat] add comments for ZBV func;

* [fix] rm useless assign and comments;

* [fix] fix ci test; add pytest;

* [feat] add run_fwd_bwd_with_microbatch  (replace input) & test; add p&p.grad assert close test & all pass;

* [feat] add apply v_schedule graph; p & p.grad assert err exist;

* [fix] update

* [feat] fix ci; add assert;

* [feat] fix poc format

* [feat] fix func name & ci; add comments;

* [fix] fix poc test; add comments in poc;

* [feat] add optim backward_b_by_grad

* [feat] fix optimizer bwd b & w; support return accum loss & output

* [feat] add fwd_bwd_step, run_fwd_only;

* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;

* [fix] fix communication_map;

* [feat] update test; rm comments;

* [fix] rm zbv in hybridplugin

* [fix] fix optim bwd;

* [fix] fix optim bwd;

* [fix] rm output.data after send fwd;

* [fix] fix bwd step if condition; remove useless comments and format info;

* [fix] fix detach output & release output;

* [fix] rm requires_grad for output;

* [fix] fix requires_grad position and detach position and input & output local buffer append position;

* [feat] add memory assertion;

* [fix] fix mem check;

* [fix] mem assertion

* [fix] fix mem assertion

* [fix] fix mem; use a new model shape; only assert mem less than or equal to theo;

* [fix] fix model zoo import;

* [fix] fix redundant detach & clone; add buffer assertion at the end;

* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.requires_grad_ with treemap;

* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;

* [fix] add testcase with microbatch 4;

* hybrid support zbv

* fix

fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update zero_bubble_pp.py

* fix

* fix-ci

* fix

[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>

* [feat] zerobubble support moehybridplugin;

* [feat] update optimizer bwd;

* [fix] fix build ci;

* [zerobubble] rebase main (#6075)

* fp8 operators for compressed communication

cast_to_fp8, cast_from_fp8, all_reduce_fp8

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix typo

* fix scaling algorithm in FP8 casting

* support fp8 communication in pipeline parallelism

* add fp8_communication flag in the script

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* shardformer fp8

* fix rebase

* remove all to all

* fix shardformer fp8 communication training degradation

* [fp8] support all-gather flat tensor (#5932)

* [fp8] add fp8 comm for low level zero

* [test] add zero fp8 test case

* [Feature] llama shardformer fp8 support (#5938)

* add llama shardformer fp8

* Llama Shardformer Parity

* fix typo

* fix all reduce

* fix pytest failure

* fix reduce op and move function to fp8.py

* fix typo

* [FP8] rebase main (#5963)

* add SimPO

* fix dataloader

* remove debug code

* add orpo

* fix style

* fix colossalai, transformers version

* fix colossalai, transformers version

* fix colossalai, transformers version

* fix torch colossalai version

* update transformers version

* [shardformer] DeepseekMoE support (#5871)

* [Feature] deepseek moe expert parallel implement

* [misc] fix typo, remove redundant file (#5867)

* [misc] fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] deepseek support & unit test

* [misc] remove debug code & useless print

* [misc] fix typos (#5872)

* [Feature] remove modeling file, use auto config. (#5884)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [Deepseek] remove redundant code (#5888)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [misc] remove redundant code

* [Feature/deepseek] resolve comment. (#5889)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [misc] remove redundant code

* [misc] mv module replacement into if branch

* [misc] add some warning message and modify some code in unit test

* [misc] fix typos

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Hotfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)

* Diffusion Model Inference support

* Stable Diffusion 3 Support

* pixartalpha support

* [HotFix] CI,import,requirements-test for #5838 (#5892)

* [Hot Fix] CI,import,requirements-test

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] Enable PP + SP for llama (#5868)

* fix cross-PP-stage position id length diff bug

* fix typo

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* use one cross entropy func for all shardformer models

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)

* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint

* fix style

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix eval

* hotfix citation

* [zero] support all-gather overlap (#5898)

* [zero] support all-gather overlap

* [zero] add overlap all-gather flag

* [misc] fix typo

* [zero] update api

* fix orpo cross entropy loss

* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)

* Remove unnecessary calls to deepcopy

* Build DimSpec's difference dict only once

This change considerably speeds up construction of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.

* Fix documentation of DimSpec's difference method

* [ShardFormer] fix qwen2 sp (#5903)

* [compatibility] support torch 2.2 (#5875)

* Support Pytorch 2.2.2

* keep build_on_pr file and update .compatibility

* fix object_to_tensor usage when torch>=2.3.0 (#5820)

* [misc] support torch2.3 (#5893)

* [misc] support torch2.3

* [devops] update compatibility ci

* [devops] update compatibility ci

* [devops] add debug

* [devops] add debug

* [devops] add debug

* [devops] add debug

* [devops] remove debug

* [devops] remove debug

* [release] update version (#5912)

* [plugin] support all-gather overlap for hybrid parallel (#5919)

* [plugin] fixed all-gather overlap support for hybrid parallel

* add kto

* fix style, add kto data sample

* [Examples] Add lazy init to OPT and GPT examples (#5924)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [ColossalChat] Hotfix for ColossalChat (#5910)

* add ignore and tiny llama

* fix path issue

* run style

* fix issue

* update bash

* add ignore and tiny llama

* fix path issue

* run style

* fix issue

* update bash

* fix ddp issue

* add Qwen 1.5 32B

* refactor tokenization

* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)

* cannot access local variable 'default_conversation' where it is not associated with a value

set default value for 'default_conversation'

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix test data

* refactor evaluation

* remove real data path

* remove real data path

* Add n_fused as an input from native_module (#5894)

* [FIX BUG] convert env param to int in (#5934)

* [Hotfix] Fix ZeRO typo #5936

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)

* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix style

* fix style

* fix style

* [shardformer] hotfix attn mask (#5945)

* [shardformer] hotfix attn mask (#5947)

* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)

* Distrifusion Support source

* comp comm overlap optimization

* sd3 benchmark

* pixart distrifusion bug fix

* sd3 bug fix and benchmark

* generation bug fix

* naming fix

* add docstring, fix counter and shape error

* add reference

* readme and requirement

* [zero] hotfix update master params (#5951)

* [release] update version (#5952)

* [Chat] Fix lora (#5946)

* fix merging

* remove filepath

* fix style

* Update README.md (#5958)

* [hotfix] Remove unused plan section (#5957)

* remove readme

* fix readme

* update

* [test] add mixtral for sequence classification

* [test] add mixtral transformer test

* [moe] fix plugin

* [test] mixtral pp shard test

* [chore] handle non member group

* [zero] solve hang

* [test] pass mixtral shardformer test

* [moe] implement transit between non moe tp and ep

* [zero] solve hang

* [misc] solve booster hang by renaming the variable

* solve hang when parallel mode = pp + dp

* [moe] implement submesh initialization

* [moe] add mixtral dp grad scaling when not all experts are activated

* [chore] manually revert unintended commit

* [chore] trivial fix

* [chore] arg pass & remove drop token

* [test] add mixtral modelling test

* [moe] implement tp

* [moe] test deepseek

* [moe] clean legacy code

* [Feature] MoE Ulysses Support (#5918)

* moe sp support

* moe sp bug solve

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [chore] minor fix

* [moe] init moe plugin comm setting with sp

* moe sp + ep bug fix

* [moe] finalize test (no pp)

* [moe] full test for deepseek and mixtral (pp + sp to fix)

* [chore] minor fix after rebase

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [chore] solve moe ckpt test failure and some other arg pass failure

* [moe] remove ops

* [test] fix test: test_zero1_2

* [bug] fix: somehow logger hangs the program

* [moe] deepseek moe sp support

* [test] add check

* [deepseek] replace attn (a workaround for bug in transformers)

* [misc] skip redundant test

* [misc] remove debug/print code

* [moe] refactor mesh assignment

* Revert "[moe] implement submesh initialization"

This reverts commit 2f9bce6686.

* [chore] change moe_pg_mesh to private

* [misc] remove incompatible test config

* [misc] fix ci failure: change default value to false in moe plugin

* [misc] remove useless condition

* [chore] docstring

* [moe] remove force_overlap_comm flag and add warning instead

* [doc] add MoeHybridParallelPlugin docstring

* [moe] solve dp axis issue

* [chore] remove redundant test case, print string & reduce test tokens

* [feat] Dist Loader for Eval (#5950)

* support auto distributed data loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* support auto distributed data loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix tp error

* remove unused parameters

* remove unused

* update inference

* update docs

* update inference

---------

Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [lora] lora support hybrid parallel plugin (#5956)

* lora support hybrid plugin

* fix

* fix

* fix

* fix

* fp8 operators for compressed communication

cast_to_fp8, cast_from_fp8, all_reduce_fp8

* fix scaling algorithm in FP8 casting

* support fp8 communication in pipeline parallelism

* add fp8_communication flag in the script

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* shardformer fp8

* fix rebase

* remove all to all

* fix shardformer fp8 communication training degradation

* [fp8] support all-gather flat tensor (#5932)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* Update low_level_optim.py

---------

Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>

* [fp8]support all2all fp8 (#5953)

* support all2all fp8

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [fp8] add fp8 linear (#5967)

* [fp8] add fp8 linear

* [test] fix fp8 linear test condition

* [test] fix fp8 linear test condition

* [test] fix fp8 linear test condition

* [fp8] support fp8 amp for hybrid parallel plugin (#5975)

* [fp8] support fp8 amp for hybrid parallel plugin

* [test] add fp8 hook test

* [fp8] fix fp8 linear compatibility

* fix (#5976)

* [Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)

* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* implement communication hook for FSDP params all-gather

* added unit test for fp8 operators

* support fp8 communication in GeminiPlugin

* update training scripts to support fsdp and fp8 communication

* fixed some minor bugs observed in unit test

* add all_gather_into_tensor_flat_fp8

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add skip the test if torch < 2.2.0

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add skip the test if torch < 2.2.0

* add skip the test if torch < 2.2.0

* add fp8_comm flag

* rebase latest fp8 operators

* rebase latest fp8 operators

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [test ci]Feature/fp8 comm (#5981)

* fix

* fix

* fix

* [fp8] support gemini plugin (#5978)

* [fp8] refactor hook

* [fp8] support gemini plugin

* [example] add fp8 option for llama benchmark

* [fp8] use torch compile (torch >= 2.3.0) (#5979)

* [fp8] use torch compile (torch >= 2.4.0)

* [fp8] set use_fast_accum in linear

* [chore] formal version check

* [chore] fix sig

* [fp8]Moe support fp8 communication (#5977)

* fix

* support moe fp8

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

fix

fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [fp8] support hybrid parallel plugin (#5982)

* support fp8 comm for qwen2 model

* support fp8 comm for qwen2 model

* support fp8 comm for qwen2 model

* fp8

* fix

* bert and bloom

* chatglm and command

* gpt2, gptj, bert, falcon, blip2

* mistral, opt, sam, t5, vit, whisper

* fix

* fix

* fix

* [fp8] refactor fp8 linear with compile (#5993)

* [fp8] refactor fp8 linear with compile

* [fp8] fix linear test

* [fp8] fix linear test

* [fp8] support asynchronous FP8 communication (#5997)

* fix

* fix

* fix

* support async all2all

* support async op for all gather

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004)

* [fp8] linear perf enhancement

* [fp8]update reduce-scatter test (#6002)

* fix

* fix

* fix

* fix

* [fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009)

* [fp8] zero support fp8 linear. (#6006)

* fix

* fix

* fix

* zero fp8

* zero fp8

* Update requirements.txt

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix the merge

* fix the merge

* fix the merge

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix the merge

* fix

* fix

* fix the merge

* fix

* fix

* fix

* fix

* fix

* fix the merge

* fix

* fix

* fix

* fix

* [fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)

* add SimPO

* fix dataloader

* remove debug code

* add orpo

* fix style

* fix colossalai, transformers version

* fix colossalai, transformers version

* fix colossalai, transformers version

* fix torch colossalai version

* update transformers version

* [shardformer] DeepseekMoE support (#5871)

* [Feature] deepseek moe expert parallel implement

* [misc] fix typo, remove redundant file (#5867)

* [misc] fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] deepseek support & unit test

* [misc] remove debug code & useless print

* [misc] fix typos (#5872)

* [Feature] remove modeling file, use auto config. (#5884)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [Deepseek] remove redundant code (#5888)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [misc] remove redundant code

* [Feature/deepseek] resolve comment. (#5889)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [misc] remove redundant code

* [misc] mv module replacement into if branch

* [misc] add some warning message and modify some code in unit test

* [misc] fix typos

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Hotfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)

* Diffusion Model Inference support

* Stable Diffusion 3 Support

* pixartalpha support

* [HotFix] CI,import,requirements-test for #5838 (#5892)

* [Hot Fix] CI,import,requirements-test

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] Enable PP + SP for llama (#5868)

* fix cross-PP-stage position id length diff bug

* fix typo

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* use one cross entropy func for all shardformer models

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)

* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint

* fix style

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix eval

* hotfix citation

* [zero] support all-gather overlap (#5898)

* [zero] support all-gather overlap

* [zero] add overlap all-gather flag

* [misc] fix typo

* [zero] update api

* fix orpo cross entropy loss

* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)

* Remove unnecessary calls to deepcopy

* Build DimSpec's difference dict only once

This change considerably speeds up construction of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough.

* Fix documentation of DimSpec's difference method

* [ShardFormer] fix qwen2 sp (#5903)

* [compatibility] support torch 2.2 (#5875)

* Support Pytorch 2.2.2

* keep build_on_pr file and update .compatibility

* fix object_to_tensor usage when torch>=2.3.0 (#5820)

* [misc] support torch2.3 (#5893)

* [misc] support torch2.3

* [devops] update compatibility ci

* [devops] update compatibility ci

* [devops] add debug

* [devops] add debug

* [devops] add debug

* [devops] add debug

* [devops] remove debug

* [devops] remove debug

* [release] update version (#5912)

* [plugin] support all-gather overlap for hybrid parallel (#5919)

* [plugin] fixed all-gather overlap support for hybrid parallel

* add kto

* fix style, add kto data sample

* [Examples] Add lazy init to OPT and GPT examples (#5924)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [ColossalChat] Hotfix for ColossalChat (#5910)

* add ignore and tiny llama

* fix path issue

* run style

* fix issue

* update bash

* add ignore and tiny llama

* fix path issue

* run style

* fix issue

* update bash

* fix ddp issue

* add Qwen 1.5 32B

* refactor tokenization

* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)

* cannot access local variable 'default_conversation' where it is not associated with a value

set default value for 'default_conversation'

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix test data

* refactor evaluation

* remove real data path

* remove real data path

* Add n_fused as an input from native_module (#5894)

* [FIX BUG] convert env param to int in (#5934)

* [Hotfix] Fix ZeRO typo #5936

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)

* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix style

* fix style

* fix style

* [shardformer] hotfix attn mask (#5945)

* [shardformer] hotfix attn mask (#5947)

* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)

* Distrifusion Support source

* comp comm overlap optimization

* sd3 benchmark

* pixart distrifusion bug fix

* sd3 bug fix and benchmark

* generation bug fix

* naming fix

* add docstring, fix counter and shape error

* add reference

* readme and requirement

* [zero] hotfix update master params (#5951)

* [release] update version (#5952)

* [Chat] Fix lora (#5946)

* fix merging

* remove filepath

* fix style

* Update README.md (#5958)

* [hotfix] Remove unused plan section (#5957)

* remove readme

* fix readme

* update

* [test] add mixtral for sequence classification

* [test] add mixtral transformer test

* [moe] fix plugin

* [test] mixtral pp shard test

* [chore] handle non member group

* [zero] solve hang

* [test] pass mixtral shardformer test

* [moe] implement transit between non moe tp and ep

* [zero] solve hang

* [misc] solve booster hang by renaming the variable

* solve hang when parallel mode = pp + dp

* [moe] implement submesh initialization

* [moe] add mixtral dp grad scaling when not all experts are activated

* [chore] manually revert unintended commit

* [chore] trivial fix

* [chore] arg pass & remove drop token

* [test] add mixtral modelling test

* [moe] implement tp

* [moe] test deepseek

* [moe] clean legacy code

* [Feature] MoE Ulysses Support (#5918)

* moe sp support

* moe sp bug solve

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [chore] minor fix

* [moe] init moe plugin comm setting with sp

* moe sp + ep bug fix

* [moe] finalize test (no pp)

* [moe] full test for deepseek and mixtral (pp + sp to fix)

* [chore] minor fix after rebase

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [chore] solve moe ckpt test failure and some other arg pass failure

* [moe] remove ops

* [test] fix test: test_zero1_2

* [bug] fix: somehow logger hangs the program

* [moe] deepseek moe sp support

* [test] add check

* [deepseek] replace attn (a workaround for bug in transformers)

* [misc] skip redundant test

* [misc] remove debug/print code

* [moe] refactor mesh assignment

* Revert "[moe] implement submesh initialization"

This reverts commit 2f9bce6686.

* [chore] change moe_pg_mesh to private

* [misc] remove incompatible test config

* [misc] fix ci failure: change default value to false in moe plugin

* [misc] remove useless condition

* [chore] docstring

* [moe] remove force_overlap_comm flag and add warning instead

* [doc] add MoeHybridParallelPlugin docstring

* [moe] solve dp axis issue

* [chore] remove redundant test case, print string & reduce test tokens

* [feat] Dist Loader for Eval (#5950)

* support auto distributed data loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* support auto distributed data loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix tp error

* remove unused parameters

* remove unused

* update inference

* update docs

* update inference

---------

Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [lora] lora support hybrid parallel plugin (#5956)

* lora support hybrid plugin

* fix

* fix

* fix

* fix

* Support overall loss, update KTO logging

* [Docs] clarify launch port

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Hotfix] README link (#5966)

* update ignore

* update readme

* run style

* update readme

* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Chat] fix readme (#5989)

* fix readme

* fix readme, tokenization fully tested

* fix readme, tokenization fully tested

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix sync condition (#6000)

* [plugin] add cast inputs option for zero (#6003)

* [pre-commit.ci] pre-commit autoupdate (#5995)

updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)

* [Feature] Zigzag Ring attention (#5905)

* halfway

* fix cross-PP-stage position id length diff bug

* fix typo

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* unified cross entropy func for all shardformer models

* remove redundant lines

* add basic ring attn; debug cross entropy

* fwd bwd logic complete

* fwd bwd logic complete; add experimental triton rescale

* precision tests passed

* precision tests passed

* fix typos and remove misc files

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add sp_mode to benchmark; fix varlen interface

* update softmax_lse shape by new interface

* change tester name

* remove buffer clone; support packed seq layout

* add varlen tests

* fix typo

* all tests passed

* add dkv_group; fix mask

* remove debug statements

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [misc] update compatibility (#6008)

* [misc] update compatibility

* [misc] update requirements

* [devops] disable requirements cache

* [test] fix torch ddp test

* [test] fix rerun on address in use

* [test] fix lazy init

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix the merge

* fix the merge

* overlap kv comm with output rescale (#6017)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* fix the merge

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix the merge

* fix

* fix

* fix the merge

* fix

* [misc] Use dist logger in plugins (#6011)

* use dist logger in plugins

* remove trash

* print on rank 0

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* fix

* fix

* fix

* fix

* fix the merge

* fix

* fix

* fix

* fix

---------

Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update train_dpo.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update low_level_zero_plugin.py

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018)

* remove triton version

* remove torch 2.2

* remove torch 2.1

* debug

* remove 2.1 build tests

* require torch >=2.2

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [plugin] hotfix zero plugin (#6036)

* [plugin] hotfix zero plugin

* [plugin] hotfix zero plugin

* [Colossal-LLaMA] Refactor latest APIs (#6030)

* refactor latest code

* update api

* add dummy dataset

* update Readme

* add setup

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update files

* add PP support

* update arguments

* update argument

* reorg folder

* update version

* remove IB info

* update utils

* update readme

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update save for zero

* update save

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add apex

* update

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* add fused norm (#6038)

* [FP8] unsqueeze scale to make it compatible with torch.compile (#6040)

* [colossalai/checkpoint_io/...] fix bug in load_state_dict_into_model; format error msg (#6020)

* fix bug in load_state_dict_into_model; format error msg

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update utils.py

to support checking missing_keys

* Update general_checkpoint_io.py

fix bug in missing_keys error message

* retrigger tests

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Hotfix] Remove deprecated install (#6042)

* remove deprecated install

* remove unused folder

* [fp8] optimize all-gather (#6043)

* [fp8] optimize all-gather

* [fp8] fix all gather fp8 ring

* [fp8] enable compile

* [fp8] fix all gather fp8 ring

* [fp8] fix linear hook (#6046)

* [fp8] disable all_to_all_fp8 in intranode  (#6045)

* enhance all_to_all_fp8 with internode comm control

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* disable some fp8 ops due to performance issue

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [release] update version (#6041)

* [release] update version

* [devops] update comp test

* [devops] update comp test debug

* [devops] debug comp test

* [devops] debug comp test

* [devops] debug comp test

* [devops] debug comp test

* [devops] debug comp test

* [Feature] Split cross-entropy computation in SP (#5959)

* halfway

* fix cross-PP-stage position id length diff bug

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* unified cross entropy func for all shardformer models

* remove redundant lines

* add basic ring attn; debug cross entropy

* fwd bwd logic complete

* fwd bwd logic complete; add experimental triton rescale

* precision tests passed

* precision tests passed

* fix typos and remove misc files

* update softmax_lse shape by new interface

* change tester name

* remove buffer clone; support packed seq layout

* add varlen tests

* fix typo

* all tests passed

* add dkv_group; fix mask

* remove debug statements

* adapt chatglm, command-R, qwen

* debug

* halfway

* fix cross-PP-stage position id length diff bug

* fix typo

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* unified cross entropy func for all shardformer models

* remove redundant lines

* add basic ring attn; debug cross entropy

* fwd bwd logic complete

* fwd bwd logic complete; add experimental triton rescale

* precision tests passed

* precision tests passed

* fix typos and remove misc files

* add sp_mode to benchmark; fix varlen interface

* update softmax_lse shape by new interface

* add varlen tests

* fix typo

* all tests passed

* add dkv_group; fix mask

* remove debug statements

* add comments

* q1 index only once

* remove events to simplify stream sync

* simplify forward/backward logic

* 2d ring forward passed

* 2d ring backward passed

* fixes

* fix ring attn loss

* 2D ring backward + llama passed

* merge

* update logger

* fix typo

* rebase

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix typo

* remove typos

* fixes

* support GPT

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)

* [example] pass use_fp8_comm flag to all plugins

* [example] add mixtral benchmark

* [moe] refine assertion and check

* [moe] fix mixtral & add more tests

* [moe] consider checking dp * sp group and moe_dp_group

* [mixtral] remove gate tp & add more tests

* [deepseek] fix tp & sp for deepseek

* [mixtral] minor fix

* [deepseek] add deepseek benchmark

* [fp8] hotfix backward hook (#6053)

* [fp8] hotfix backward hook

* [fp8] hotfix pipeline loss accumulation

* [doc] update sp doc (#6055)

* update sp doc

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix the sp

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix the attn

* fix

* fix

* fix

* fix

* [zerobubble]Support ZeroBubble Pipeline (#6034)

* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;

* [feat] add dw test;

* [fix] fix weight not close;

* [update] update text;

* [feat] add test run_fwd_bwd automatic scheduling;

* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;

* [feat] add test for p & p grad;

* [feat] add comments for ZBV func;

* [fix] rm useless assign and comments;

* [fix] fix ci test; add pytest;

* [feat] add run_fwd_bwd_with_microbatch  (replace input) & test; add p&p.grad assert close test & all pass;

* [feat] add apply v_schedule graph; p & p.grad assert err exist;

* [fix] update

* [feat] fix ci; add assert;

* [feat] fix poc format

* [feat] fix func name & ci; add comments;

* [fix] fix poc test; add comments in poc;

* [feat] add optim backward_b_by_grad

* [feat] fix optimizer bwd b & w; support return accum loss & output

* [feat] add fwd_bwd_step, run_fwd_only;

* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;

* [fix] fix communication_map;

* [feat] update test; rm comments;

* [fix] rm zbv in hybridplugin

* [fix] fix optim bwd;

* [fix] fix optim bwd;

* [fix] rm output.data after send fwd;

* [fix] fix bwd step if condition; remove useless comments and format info;

* [fix] fix detach output & release output;

* [fix] rm requires_grad for output;

* [fix] fix requires_grad position, detach position, and input & output local buffer append position;

* [feat] add memory assertion;

* [fix] fix mem check;

* [fix] mem assertion

* [fix] fix mem assertion

* [fix] fix mem; use a new model shape; only assert mem less than or equal to theo;

* [fix] fix model zoo import;

* [fix] fix redundant detach & clone; add buffer assertion at the end;

* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;

* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;

* [fix] add testcase with microbatch 4;

* [fp8] fix missing fp8_comm flag in mixtral (#6057)

* fix

* fix

* fix

* [fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)

* all_gather only internode, fix pytest

* fix cuda arch <89 compile pytest error

* fix pytest failure

* disable all_gather_into_tensor_flat_fp8

* fix fp8 format

* fix pytest

* fix conversations

* fix chunk tuple to list

* [doc] FP8 training and communication document (#6050)

* Add FP8 training and communication document

* add fp8 docstring for plugins

* fix typo

* fix typo

* fix

* fix

* [moe] add parallel strategy for shared_expert && fix test for deepseek (#6063)

* [ColossalEval] support for vllm (#6056)

* support vllm

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* modify vllm and update readme

* run pre-commit

* remove dupilicated lines and refine code

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update param name

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* refine code

* update readme

* refine code

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [release] update version (#6062)

* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;

* [update] update text;

* [feat] add test run_fwd_bwd automatic scheduling;

* [feat] fix poc format

* [fix] fix poc test; add comments in poc;

* [feat] add optim backward_b_by_grad

* [feat] fix optimizer bwd b & w; support return accum loss & output

* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;

* [feat] update test; rm comments;

* [fix] rm zbv in hybridplugin

* [fix] fix optim bwd;

* [fix] fix optim bwd;

* [fix] rm output.data after send fwd;

* [fix] fix bwd step if condition; remove useless comments and format info;

* [fix] fix mem check;

* [fix] fix mem assertion

* [fix] fix mem; use a new model shape; only assert mem less than or equal to theo;

* [fix] fix model zoo import;

* [feat] moehybrid support zerobubble;

* [fix] fix zerobubble pp for shardformer type input;

* [fix] fix require_grad & deallocate call;

* [fix] fix mem assert;

* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs;

* [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch;

* [fix] fix zerobubble; support shardformer model type;

* [fix] fix test_pipeline_utils ci;

* [plugin] hybrid support zero bubble pipeline (#6060)

* hybrid support zbv

* fix

fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* Update zero_bubble_pp.py

* fix

* fix-ci

* fix

[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* fix

* [zerobubble]Support ZeroBubble Pipeline (#6034)

* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;

* [feat] add dw test;

* [fix] fix weight not close;

* [update] update text;

* [feat] add test run_fwd_bwd automatic scheduling;

* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;

* [feat] add test for p & p grad;

* [feat] add comments for ZBV func;

* [fix] rm useless assign and comments;

* [fix] fix ci test; add pytest;

* [feat] add run_fwd_bwd_with_microbatch  (replace input) & test; add p&p.grad assert close test & all pass;

* [feat] add apply v_schedule graph; p & p.grad assert err exist;

* [fix] update

* [feat] fix ci; add assert;

* [feat] fix poc format

* [feat] fix func name & ci; add comments;

* [fix] fix poc test; add comments in poc;

* [feat] add optim backward_b_by_grad

* [feat] fix optimizer bwd b & w; support return accum loss & output

* [feat] add fwd_bwd_step, run_fwd_only;

* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;

* [fix] fix communication_map;

* [feat] update test; rm comments;

* [fix] rm zbv in hybridplugin

* [fix] fix optim bwd;

* [fix] fix optim bwd;

* [fix] rm output.data after send fwd;

* [fix] fix bwd step if condition; remove useless comments and format info;

* [fix] fix detach output & release output;

* [fix] rm requires_grad for output;

* [fix] fix requires_grad position, detach position, and input & output local buffer append position;

* [feat] add memory assertion;

* [fix] fix mem check;

* [fix] mem assertion

* [fix] fix mem assertion

* [fix] fix mem; use a new model shape; only assert mem less than or equal to theo;

* [fix] fix model zoo import;

* [fix] fix redundant detach & clone; add buffer assertion at the end;

* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;

* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;

* [fix] add testcase with microbatch 4;

* hybrid support zbv

* fix

fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update zero_bubble_pp.py

* fix

* fix-ci

* fix

[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>

* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;

* [update] update text;

* [feat] add test run_fwd_bwd automatic scheduling;

* [feat] fix poc format

* [fix] fix poc test; add comments in poc;

* [feat] add optim backward_b_by_grad

* [feat] fix optimizer bwd b & w; support return accum loss & output

* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;

* [feat] update test; rm comments;

* [fix] fix optim bwd;

* [fix] fix optim bwd;

* [fix] rm output.data after send fwd;

* [fix] fix bwd step if condition; remove useless comments and format info;

* [fix] fix mem check;

* [fix] fix mem assertion

* [fix] fix mem; use a new model shape; only assert mem less than or equal to theo;

* [fix] fix model zoo import;

* [fix] fix mem assert;

* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs;

* [plugin] hybrid support zero bubble pipeline (#6060)

* hybrid support zbv

* fix

fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* Update zero_bubble_pp.py

* fix

* fix-ci

* fix

[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* fix

* [zerobubble]Support ZeroBubble Pipeline (#6034)

* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;

* [feat] add dw test;

* [fix] fix weight not close;

* [update] update text;

* [feat] add test run_fwd_bwd automatic scheduling;

* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;

* [feat] add test for p & p grad;

* [feat] add comments for ZBV func;

* [fix] rm useless assign and comments;

* [fix] fix ci test; add pytest;

* [feat] add run_fwd_bwd_with_microbatch  (replace input) & test; add p&p.grad assert close test & all pass;

* [feat] add apply v_schedule graph; p & p.grad assert err exist;

* [fix] update

* [feat] fix ci; add assert;

* [feat] fix poc format

* [feat] fix func name & ci; add comments;

* [fix] fix poc test; add comments in poc;

* [feat] add optim backward_b_by_grad

* [feat] fix optimizer bwd b & w; support return accum loss & output

* [feat] add fwd_bwd_step, run_fwd_only;

* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;

* [fix] fix communication_map;

* [feat] update test; rm comments;

* [fix] rm zbv in hybridplugin

* [fix] fix optim bwd;

* [fix] fix optim bwd;

* [fix] rm output.data after send fwd;

* [fix] fix bwd step if condition; remove useless comments and format info;

* [fix] fix detach output & release output;

* [fix] rm requires_grad for output;

* [fix] fix requires_grad position, detach position, and input & output local buffer append position;

* [feat] add memory assertion;

* [fix] fix mem check;

* [fix] mem assertion

* [fix] fix mem assertion

* [fix] fix mem; use a new model shape; only assert mem less than or equal to theo;

* [fix] fix model zoo import;

* [fix] fix redundant detach & clone; add buffer assertion at the end;

* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap;

* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;

* [fix] add testcase with microbatch 4;

* hybrid support zbv

* fix

fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update zero_bubble_pp.py

* fix

* fix-ci

* fix

[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: HangXu <hangxu0304@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: GuangyaoZhang <xjtu521@qq.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: wangbluo <2538539015@qq.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>

* [fix] fix mixtral policy;

* [fix] fix mixtral policy;

* [feat] support zbv in mixtral benchmark;

* [fix] MixtralForCausalLMPolicy get_held_layer support zbv;

* [feat] update MixtralPipelineForwards --> mixtral_model_forward; support zbv;

* [feat] support MixtralPipelineForwards--> mixtral_for_causal_lm_forward for zbv

* [zero bubble] support zero (#6080)

* fp8 operators for compressed communication

cast_to_fp8, cast_from_fp8, all_reduce_fp8

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix typo

* fix scaling algorithm in FP8 casting

* support fp8 communication in pipeline parallelism

* add fp8_communication flag in the script

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* shardformer fp8

* fix rebase

* remove all to all

* fix shardformer fp8 communication training degradation

* [fp8] support all-gather flat tensor (#5932)

* [fp8] add fp8 comm for low level zero

* [test] add zero fp8 test case

* [Feature] llama shardformer fp8 support (#5938)

* add llama shardformer fp8

* Llama Shardformer Parity

* fix typo

* fix all reduce

* fix pytest failure

* fix reduce op and move function to fp8.py

* fix typo

* [FP8] rebase main (#5963)

* add SimPO

* fix dataloader

* remove debug code

* add orpo

* fix style

* fix colossalai, transformers version

* fix colossalai, transformers version

* fix colossalai, transformers version

* fix torch colossalai version

* update transformers version

* [shardformer] DeepseekMoE support (#5871)

* [Feature] deepseek moe expert parallel implement

* [misc] fix typo, remove redundant file (#5867)

* [misc] fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] deepseek support & unit test

* [misc] remove debug code & useless print

* [misc] fix typos (#5872)

* [Feature] remove modeling file, use auto config. (#5884)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [Deepseek] remove redundant code (#5888)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [misc] remove redundant code

* [Feature/deepseek] resolve comment. (#5889)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [misc] remove redundant code

* [misc] mv module replacement into if branch

* [misc] add some warning messages and modify some code in the unit test

* [misc] fix typos

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Hotfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)

* Diffusion Model Inference support

* Stable Diffusion 3 Support

* pixartalpha support

* [HotFix] CI,import,requirements-test for #5838 (#5892)

* [Hot Fix] CI,import,requirements-test

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] Enable PP + SP for llama (#5868)

* fix cross-PP-stage position id length diff bug

* fix typo

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* use one cross entropy func for all shardformer models

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)

* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint

* fix style

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix eval

* hotfix citation

* [zero] support all-gather overlap (#5898)

* [zero] support all-gather overlap

* [zero] add overlap all-gather flag

* [misc] fix typo

* [zero] update api

* fix orpo cross entropy loss

* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)

* Remove unnecessary calls to deepcopy

* Build DimSpec's difference dict only once

This change considerably speeds up construction of DimSpec objects: the difference_dict is the same for every DimSpec object, so a single shared copy is enough.

* Fix documentation of DimSpec's difference method

* [ShardFormer] fix qwen2 sp (#5903)

* [compatibility] support torch 2.2 (#5875)

* Support Pytorch 2.2.2

* keep build_on_pr file and update .compatibility

* fix object_to_tensor usage when torch>=2.3.0 (#5820)

* [misc] support torch2.3 (#5893)

* [misc] support torch2.3

* [devops] update compatibility ci

* [devops] update compatibility ci

* [devops] add debug

* [devops] add debug

* [devops] add debug

* [devops] add debug

* [devops] remove debug

* [devops] remove debug

* [release] update version (#5912)

* [plugin] support all-gather overlap for hybrid parallel (#5919)

* [plugin] fixed all-gather overlap support for hybrid parallel

* add kto

* fix style, add kto data sample

* [Examples] Add lazy init to OPT and GPT examples (#5924)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [ColossalChat] Hotfix for ColossalChat (#5910)

* add ignore and tiny llama

* fix path issue

* run style

* fix issue

* update bash

* add ignore and tiny llama

* fix path issue

* run style

* fix issue

* update bash

* fix ddp issue

* add Qwen 1.5 32B

* refactor tokenization

* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)

* cannot access local variable 'default_conversation' where it is not associated with a value

set default value for 'default_conversation'

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix test data

* refactor evaluation

* remove real data path

* remove real data path

* Add n_fused as an input from native_module (#5894)

* [FIX BUG] convert env param to int in (#5934)

* [Hotfix] Fix ZeRO typo #5936

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)

* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix style

* fix style

* fix style

* [shardformer] hotfix attn mask (#5945)

* [shardformer] hotfix attn mask (#5947)

* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)

* Distrifusion Support source

* comp comm overlap optimization

* sd3 benchmark

* pixart distrifusion bug fix

* sd3 bug fix and benchmark

* generation bug fix

* naming fix

* add docstring, fix counter and shape error

* add reference

* readme and requirement

* [zero] hotfix update master params (#5951)

* [release] update version (#5952)

* [Chat] Fix lora (#5946)

* fix merging

* remove filepath

* fix style

* Update README.md (#5958)

* [hotfix] Remove unused plan section (#5957)

* remove readme

* fix readme

* update

* [test] add mixtral for sequence classification

* [test] add mixtral transformer test

* [moe] fix plugin

* [test] mixtral pp shard test

* [chore] handle non member group

* [zero] solve hang

* [test] pass mixtral shardformer test

* [moe] implement transit between non moe tp and ep

* [zero] solve hang

* [misc] solve booster hang by renaming the variable

* solve hang when parallel mode = pp + dp

* [moe] implement submesh initialization

* [moe] add mixtral dp grad scaling when not all experts are activated

* [chore] manually revert unintended commit

* [chore] trivial fix

* [chore] arg pass & remove drop token

* [test] add mixtral modelling test

* [moe] implement tp

* [moe] test deepseek

* [moe] clean legacy code

* [Feature] MoE Ulysses Support (#5918)

* moe sp support

* moe sp bug solve

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [chore] minor fix

* [moe] init moe plugin comm setting with sp

* moe sp + ep bug fix

* [moe] finalize test (no pp)

* [moe] full test for deepseek and mixtral (pp + sp to fix)

* [chore] minor fix after rebase

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [chore] solve moe ckpt test failure and some other arg pass failure

* [moe] remove ops

* [test] fix test: test_zero1_2

* [bug] fix: somehow logger hangs the program

* [moe] deepseek moe sp support

* [test] add check

* [deepseek] replace attn (a workaround for bug in transformers)

* [misc] skip redundant test

* [misc] remove debug/print code

* [moe] refactor mesh assignment

* Revert "[moe] implement submesh initialization"

This reverts commit 2f9bce6686.

* [chore] change moe_pg_mesh to private

* [misc] remove incompatible test config

* [misc] fix ci failure: change default value to false in moe plugin

* [misc] remove useless condition

* [chore] docstring

* [moe] remove force_overlap_comm flag and add warning instead

* [doc] add MoeHybridParallelPlugin docstring

* [moe] solve dp axis issue

* [chore] remove redundant test case, print string & reduce test tokens

* [feat] Dist Loader for Eval (#5950)

* support auto distributed data loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* support auto distributed data loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix tp error

* remove unused parameters

* remove unused

* update inference

* update docs

* update inference

---------

Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [lora] lora support hybrid parallel plugin (#5956)

* lora support hybrid plugin

* fix

* fix

* fix

* fix

* fp8 operators for compressed communication

cast_to_fp8, cast_from_fp8, all_reduce_fp8

* fix scaling algorithm in FP8 casting

* support fp8 communication in pipeline parallelism

* add fp8_communication flag in the script

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* shardformer fp8

* fix rebase

* remove all to all

* fix shardformer fp8 communication training degradation

* [fp8] support all-gather flat tensor (#5932)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* Update low_level_optim.py

---------

Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>

* [fp8]support all2all fp8 (#5953)

* support all2all fp8

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [fp8] add fp8 linear (#5967)

* [fp8] add fp8 linear

* [test] fix fp8 linear test condition

* [test] fix fp8 linear test condition

* [test] fix fp8 linear test condition

* [fp8] support fp8 amp for hybrid parallel plugin (#5975)

* [fp8] support fp8 amp for hybrid parallel plugin

* [test] add fp8 hook test

* [fp8] fix fp8 linear compatibility

* fix (#5976)

* [Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)

* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* implement communication hook for FSDP params all-gather

* added unit test for fp8 operators

* support fp8 communication in GeminiPlugin

* update training scripts to support fsdp and fp8 communication

* fixed some minor bugs observed in unit test

* add all_gather_into_tensor_flat_fp8

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add skip the test if torch < 2.2.0

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add skip the test if torch < 2.2.0

* add skip the test if torch < 2.2.0

* add fp8_comm flag

* rebase latest fp8 operators

* rebase latest fp8 operators

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [test ci]Feature/fp8 comm (#5981)

* fix

* fix

* fix

* [fp8] support gemini plugin (#5978)

* [fp8] refactor hook

* [fp8] support gemini plugin

* [example] add fp8 option for llama benchmark

* [fp8] use torch compile (torch >= 2.3.0) (#5979)

* [fp8] use torch compile (torch >= 2.4.0)

* [fp8] set use_fast_accum in linear

* [chore] formal version check

* [chore] fix sig

* [fp8]Moe support fp8 communication (#5977)

* fix

* support moe fp8

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

fix

fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [fp8] support hybrid parallel plugin (#5982)

* support fp8 comm for qwen2 model

* support fp8 comm for qwen2 model

* support fp8 comm for qwen2 model

* fp8

* fix

* bert and bloom

* chatglm and command

* gpt2, gptj, bert, falcon, blip2

* mistral, opt, sam, t5, vit, whisper

* fix

* fix

* fix

* [fp8] refactor fp8 linear with compile (#5993)

* [fp8] refactor fp8 linear with compile

* [fp8] fix linear test

* [fp8] fix linear test

* [fp8] support asynchronous FP8 communication (#5997)

* fix

* fix

* fix

* support async all2all

* support async op for all gather

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004)

* [fp8] linear perf enhancement

* [fp8]update reduce-scatter test (#6002)

* fix

* fix

* fix

* fix

* [fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009)

* [fp8] zero support fp8 linear. (#6006)

* fix

* fix

* fix

* zero fp8

* zero fp8

* Update requirements.txt

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix the merge

* fix the merge

* fix the merge

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix the merge

* fix

* fix

* fix the merge

* fix

* fix

* fix

* fix

* fix

* fix the merge

* fix

* fix

* fix

* fix

* [fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)

* add SimPO

* fix dataloader

* remove debug code

* add orpo

* fix style

* fix colossalai, transformers version

* fix colossalai, transformers version

* fix colossalai, transformers version

* fix torch colossalai version

* update transformers version

* [shardformer] DeepseekMoE support (#5871)

* [Feature] deepseek moe expert parallel implement

* [misc] fix typo, remove redundant file (#5867)

* [misc] fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] deepseek support & unit test

* [misc] remove debug code & useless print

* [misc] fix typos (#5872)

* [Feature] remove modeling file, use auto config. (#5884)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [Deepseek] remove redundant code (#5888)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [misc] remove redundant code

* [Feature/deepseek] resolve comment. (#5889)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [misc] remove redundant code

* [misc] mv module replacement into if branch

* [misc] add some warning messages and modify some code in the unit test

* [misc] fix typos

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Hotfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)

* Diffusion Model Inference support

* Stable Diffusion 3 Support

* pixartalpha support

* [HotFix] CI,import,requirements-test for #5838 (#5892)

* [Hot Fix] CI,import,requirements-test

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] Enable PP + SP for llama (#5868)

* fix cross-PP-stage position id length diff bug

* fix typo

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* use one cross entropy func for all shardformer models

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)

* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint

* fix style

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix eval

* hotfix citation

* [zero] support all-gather overlap (#5898)

* [zero] support all-gather overlap

* [zero] add overlap all-gather flag

* [misc] fix typo

* [zero] update api

* fix orpo cross entropy loss

* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)

* Remove unnecessary calls to deepcopy

* Build DimSpec's difference dict only once

This change considerably speeds up construction of DimSpec objects: the difference_dict is the same for every DimSpec object, so a single shared copy is enough.

* Fix documentation of DimSpec's difference method

* [ShardFormer] fix qwen2 sp (#5903)

* [compatibility] support torch 2.2 (#5875)

* Support Pytorch 2.2.2

* keep build_on_pr file and update .compatibility

* fix object_to_tensor usage when torch>=2.3.0 (#5820)

* [misc] support torch2.3 (#5893)

* [misc] support torch2.3

* [devops] update compatibility ci

* [devops] update compatibility ci

* [devops] add debug

* [devops] add debug

* [devops] add debug

* [devops] add debug

* [devops] remove debug

* [devops] remove debug

* [release] update version (#5912)

* [plugin] support all-gather overlap for hybrid parallel (#5919)

* [plugin] fixed all-gather overlap support for hybrid parallel

* add kto

* fix style, add kto data sample

* [Examples] Add lazy init to OPT and GPT examples (#5924)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [ColossalChat] Hotfix for ColossalChat (#5910)

* add ignore and tiny llama

* fix path issue

* run style

* fix issue

* update bash

* add ignore and tiny llama

* fix path issue

* run style

* fix issue

* update bash

* fix ddp issue

* add Qwen 1.5 32B

* refactor tokenization

* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)

* cannot access local variable 'default_conversation' where it is not associated with a value

set default value for 'default_conversation'

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix test data

* refactor evaluation

* remove real data path

* remove real data path

* Add n_fused as an input from native_module (#5894)

* [FIX BUG] convert env param to int in (#5934)

* [Hotfix] Fix ZeRO typo #5936

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)

* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix style

* fix style

* fix style

* [shardformer] hotfix attn mask (#5945)

* [shardformer] hotfix attn mask (#5947)

* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)

* Distrifusion Support source

* comp comm overlap optimization

* sd3 benchmark

* pixart distrifusion bug fix

* sd3 bug fix and benchmark

* generation bug fix

* naming fix

* add docstring, fix counter and shape error

* add reference

* readme and requirement

* [zero] hotfix update master params (#5951)

* [release] update version (#5952)

* [Chat] Fix lora (#5946)

* fix merging

* remove filepath

* fix style

* Update README.md (#5958)

* [hotfix] Remove unused plan section (#5957)

* remove readme

* fix readme

* update

* [test] add mixtral for sequence classification

* [test] add mixtral transformer test

* [moe] fix plugin

* [test] mixtral pp shard test

* [chore] handle non member group

* [zero] solve hang

* [test] pass mixtral shardformer test

* [moe] implement transit between non moe tp and ep

* [zero] solve hang

* [misc] solve booster hang by renaming the variable

* solve hang when parallel mode = pp + dp

* [moe] implement submesh initialization

* [moe] add mixtral dp grad scaling when not all experts are activated

* [chore] manually revert unintended commit

* [chore] trivial fix

* [chore] arg pass & remove drop token

* [test] add mixtral modelling test

* [moe] implement tp

* [moe] test deepseek

* [moe] clean legacy code

* [Feature] MoE Ulysses Support (#5918)

* moe sp support

* moe sp bug solve

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [chore] minor fix

* [moe] init moe plugin comm setting with sp

* moe sp + ep bug fix

* [moe] finalize test (no pp)

* [moe] full test for deepseek and mixtral (pp + sp to fix)

* [chore] minor fix after rebase

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [chore] solve moe ckpt test failure and some other arg pass failure

* [moe] remove ops

* [test] fix test: test_zero1_2

* [bug] fix: somehow logger hangs the program

* [moe] deepseek moe sp support

* [test] add check

* [deepseek] replace attn (a workaround for bug in transformers)

* [misc] skip redundant test

* [misc] remove debug/print code

* [moe] refactor mesh assignment

* Revert "[moe] implement submesh initialization"

This reverts commit 2f9bce6686.

* [chore] change moe_pg_mesh to private

* [misc] remove incompatible test config

* [misc] fix ci failure: change default value to false in moe plugin

* [misc] remove useless condition

* [chore] docstring

* [moe] remove force_overlap_comm flag and add warning instead

* [doc] add MoeHybridParallelPlugin docstring

* [moe] solve dp axis issue

* [chore] remove redundant test case, print string & reduce test tokens

* [feat] Dist Loader for Eval (#5950)

* support auto distributed data loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* support auto distributed data loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix tp error

* remove unused parameters

* remove unused

* update inference

* update docs

* update inference

---------

Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [lora] lora support hybrid parallel plugin (#5956)

* lora support hybrid plugin

* fix

* fix

* fix

* fix

* Support overall loss, update KTO logging

* [Docs] clarify launch port

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Hotfix] README link (#5966)

* update ignore

* update readme

* run style

* update readme

* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Chat] fix readme (#5989)

* fix readme

* fix readme, tokenization fully tested

* fix readme, tokenization fully tested

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix sync condition (#6000)

* [plugin] add cast inputs option for zero (#6003)

* [pre-commit.ci] pre-commit autoupdate (#5995)

updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)

* [Feature] Zigzag Ring attention (#5905)

* halfway

* fix cross-PP-stage position id length diff bug

* fix typo

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* unified cross entropy func for all shardformer models

* remove redundant lines

* add basic ring attn; debug cross entropy

* fwd bwd logic complete

* fwd bwd logic complete; add experimental triton rescale

* precision tests passed

* precision tests passed

* fix typos and remove misc files

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add sp_mode to benchmark; fix varlen interface

* update softmax_lse shape by new interface

* change tester name

* remove buffer clone; support packed seq layout

* add varlen tests

* fix typo

* all tests passed

* add dkv_group; fix mask

* remove debug statements

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [misc] update compatibility (#6008)

* [misc] update compatibility

* [misc] update requirements

* [devops] disable requirements cache

* [test] fix torch ddp test

* [test] fix rerun on address in use

* [test] fix lazy init

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix the merge

* fix the merge

* overlap kv comm with output rescale (#6017)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* fix the merge

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix the merge

* fix

* fix

* fix the merge

* fix

* [misc] Use dist logger in plugins (#6011)

* use dist logger in plugins

* remove trash

* print on rank 0

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* fix

* fix

* fix

* fix

* fix the merge

* fix

* fix

* fix

* fix

---------

Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update train_dpo.py

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update low_level_zero_plugin.py

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018)

* remove triton version

* remove torch 2.2

* remove torch 2.1

* debug

* remove 2.1 build tests

* require torch >=2.2

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [plugin] hotfix zero plugin (#6036)

* [plugin] hotfix zero plugin

* [plugin] hotfix zero plugin

* [Colossal-LLaMA] Refactor latest APIs (#6030)

* refactor latest code

* update api

* add dummy dataset

* update Readme

* add setup

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update files

* add PP support

* update arguments

* update argument

* reorg folder

* update version

* remove IB info

* update utils

* update readme

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update save for zero

* update save

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add apex

* update

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* add fused norm (#6038)

* [FP8] unsqueeze scale to make it compatible with torch.compile (#6040)

* [colossalai/checkpoint_io/...] fix bug in load_state_dict_into_model; format error msg (#6020)

* fix bug in load_state_dict_into_model; format error msg

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update utils.py

to support checking missing_keys

* Update general_checkpoint_io.py

fix bug in missing_keys error message

* retrigger tests

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Hotfix] Remove deprecated install (#6042)

* remove deprecated install

* remove unused folder

* [fp8] optimize all-gather (#6043)

* [fp8] optimize all-gather

* [fp8] fix all gather fp8 ring

* [fp8] enable compile

* [fp8] fix all gather fp8 ring

* [fp8] fix linear hook (#6046)

* [fp8] disable all_to_all_fp8 in intranode  (#6045)

* enhance all_to_all_fp8 with internode comm control

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* disable some fp8 ops due to performance issue

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [release] update version (#6041)

* [release] update version

* [devops] update comp test

* [devops] update comp test debug

* [devops] debug comp test

* [devops] debug comp test

* [devops] debug comp test

* [devops] debug comp test

* [devops] debug comp test

* [Feature] Split cross-entropy computation in SP (#5959)

* halfway

* fix cross-PP-stage position id length diff bug

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* unified cross entropy func for all shardformer models

* remove redundant lines

* add basic ring attn; debug cross entropy

* fwd bwd logic complete

* fwd bwd logic complete; add experimental triton rescale

* precision tests passed

* precision tests passed

* fix typos and remove misc files

* update softmax_lse shape by new interface

* change tester name

* remove buffer clone; support packed seq layout

* add varlen tests

* fix typo

* all tests passed

* add dkv_group; fix mask

* remove debug statements

* adapt chatglm, command-R, qwen

* debug

* halfway

* fix cross-PP-stage position id length diff bug

* fix typo

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* unified cross entropy func for all shardformer models

* remove redundant lines

* add basic ring attn; debug cross entropy

* fwd bwd logic complete

* fwd bwd logic complete; add experimental triton rescale

* precision tests passed

* precision tests passed

* fix typos and remove misc files

* add sp_mode to benchmark; fix varlen interface

* update softmax_lse shape by new interface

* add varlen tests

* fix typo

* all tests passed

* add dkv_group; fix mask

* remove debug statements

* add comments

* q1 index only once

* remove events to simplify stream sync

* simplify forward/backward logic

* 2d ring forward passed

* 2d ring backward passed

* fixes

* fix ring attn loss

* 2D ring backward + llama passed

* merge

* update logger

* fix typo

* rebase

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix typo

* remove typos

* fixes

* support GPT

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)

* [example] pass use_fp8_comm flag to all plugins

* [example] add mixtral benchmark

* [moe] refine assertion and check

* [moe] fix mixtral & add more tests

* [moe] consider checking dp * sp group and moe_dp_group

* [mixtral] remove gate tp & add more tests

* [deepseek] fix tp & sp for deepseek

* [mixtral] minor fix

* [deepseek] add deepseek benchmark

* [fp8] hotfix backward hook (#6053)

* [fp8] hotfix backward hook

* [fp8] hotfix pipeline loss accumulation

* [doc] update sp doc (#6055)

* update sp doc

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix the sp

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix the attn

* fix

* fix

* fix

* fix

* [zerobubble]Support ZeroBubble Pipeline (#6034)

* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;

* [feat] add dw test;

* [fix] fix weight not close;

* [update] update text;

* [feat] add test run_fwd_bwd automatic scheduling;

* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;

* [feat] add test for p & p grad;

* [feat] add comments for ZBV func;

* [fix] rm useless assign and comments;

* [fix] fix ci test; add pytest;

* [feat] add run_fwd_bwd_with_microbatch  (replace input) & test; add p&p.grad assert close test & all pass;

* [feat] add apply v_schedule graph; p & p.grad assert err exist;

* [fix] update

* [feat] fix ci; add assert;

* [feat] fix poc format

* [feat] fix func name & ci; add comments;

* [fix] fix poc test; add comments in poc;

* [feat] add optim backward_b_by_grad

* [feat] fix optimizer bwd b & w; support return accum loss & output

* [feat] add fwd_bwd_step, run_fwd_only;

* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;

* [fix] fix communication_map;

* [feat] update test; rm comments;

* [fix] rm zbv in hybridplugin

* [fix] fix optim bwd;

* [fix] fix optim bwd;

* [fix] rm output.data after send fwd;

* [fix] fix bwd step if condition; remove useless comments and format info;

* [fix] fix detach output & release output;

* [fix] rm requires_grad for output;

* [fix] fix requires_grad position, detach position, and input & output local buffer append position;

* [feat] add memory assertion;

* [fix] fix mem check;

* [fix] mem assertion

* [fix] fix mem assertion

* [fix] fix mem; use a new model shape; only assert mem less than or equal to theoretical;

* [fix] fix model zoo import;

* [fix] fix redundant detach & clone; add buffer assertion at the end;

* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.requires_grad_ with treemap;

* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;

* [fix] add testcase with microbatch 4;

* [fp8] fix missing fp8_comm flag in mixtral (#6057)

* fix

* fix

* fix

* [fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)

* all_gather only internode, fix pytest

* fix cuda arch <89 compile pytest error

* fix pytest failure

* disable all_gather_into_tensor_flat_fp8

* fix fp8 format

* fix pytest

* fix conversations

* fix chunk tuple to list

* [doc] FP8 training and communication document (#6050)

* Add FP8 training and communication document

* add fp8 docstring for plugins

* fix typo

* fix typo

* fix

* fix

* [moe] add parallel strategy for shared_expert && fix test for deepseek (#6063)

* [ColossalEval] support for vllm (#6056)

* support vllm

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* modify vllm and update readme

* run pre-commit

* remove duplicated lines and refine code

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update param name

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* refine code

* update readme

* refine code

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [release] update version (#6062)

* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;

* [update] update text;

* [feat] add test run_fwd_bwd automatic scheduling;

* [feat] fix poc format

* [fix] fix poc test; add comments in poc;

* [feat] add optim backward_b_by_grad

* [feat] fix optimizer bwd b & w; support return accum loss & output

* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;

* [feat] update test; rm comments;

* [fix] rm zbv in hybridplugin

* [fix] fix optim bwd;

* [fix] fix optim bwd;

* [fix] rm output.data after send fwd;

* [fix] fix bwd step if condition; remove useless comments and format info;

* [fix] fix mem check;

* [fix] fix mem assertion

* [fix] fix mem; use a new model shape; only assert mem less than or equal to theoretical;

* [fix] fix model zoo import;

* [feat] moehybrid support zerobubble;

* [fix] fix zerobubble pp for shardformer type input;

* [fix] fix require_grad & deallocate call;

* [fix] fix mem assert;

* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs

* [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch;

* [fix] fix zerobubble; support shardformer model type;

* [fix] fix test_pipeline_utils ci;

* [plugin] hybrid support zero bubble pipeline (#6060)

* hybrid support zbv

* fix

fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* Update zero_bubble_pp.py

* fix

* fix-ci

* fix

[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* fix

* [zerobubble]Support ZeroBubble Pipeline (#6034)

* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;

* [feat] add dw test;

* [fix] fix weight not close;

* [update] update text;

* [feat] add test run_fwd_bwd automatic scheduling;

* [feat] split communication and calculation; fix pop empty send_bwd_buffer error;

* [feat] add test for p & p grad;

* [feat] add comments for ZBV func;

* [fix] rm useless assign and comments;

* [fix] fix ci test; add pytest;

* [feat] add run_fwd_bwd_with_microbatch  (replace input) & test; add p&p.grad assert close test & all pass;

* [feat] add apply v_schedule graph; p & p.grad assert err exist;

* [fix] update

* [feat] fix ci; add assert;

* [feat] fix poc format

* [feat] fix func name & ci; add comments;

* [fix] fix poc test; add comments in poc;

* [feat] add optim backward_b_by_grad

* [feat] fix optimizer bwd b & w; support return accum loss & output

* [feat] add fwd_bwd_step, run_fwd_only;

* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;

* [fix] fix communication_map;

* [feat] update test; rm comments;

* [fix] rm zbv in hybridplugin

* [fix] fix optim bwd;

* [fix] fix optim bwd;

* [fix] rm output.data after send fwd;

* [fix] fix bwd step if condition; remove useless comments and format info;

* [fix] fix detach output & release output;

* [fix] rm requires_grad for output;

* [fix] fix requires_grad position, detach position, and input & output local buffer append position;

* [feat] add memory assertion;

* [fix] fix mem check;

* [fix] mem assertion

* [fix] fix mem assertion

* [fix] fix mem; use a new model shape; only assert mem less than or equal to theoretical;

* [fix] fix model zoo import;

* [fix] fix redundant detach & clone; add buffer assertion at the end;

* [fix] add output_obj_grad assert None at bwd b step; replace input_obj.requires_grad_ with treemap;

* [fix] update optim state dict assert (include param group & state); fix mem assert after add optim;

* [fix] add testcase with microbatch 4;

* hybrid support zbv

* fix

fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update zero_bubble_pp.py

* fix

* fix-ci

* fix

[pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <935724073@qq.com>

* [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble;

* [update] update text;

* [feat] add test run_fwd_bwd automatic scheduling;

* [feat] fix poc format

* [fix] fix poc test; add comments in poc;

* [feat] add optim backward_b_by_grad

* [feat] fix optimizer bwd b & w; support return accum loss & output

* [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict;

* [feat] update test; rm comments;

* [fix] fix optim bwd;

* [fix] fix optim bwd;

* [fix] rm output.data after send fwd;

* [fix] fix bwd step if condition; remove useless comments and format info;

* [fix] fix mem check;

* [fix] fix mem assertion

* [fix] fix mem; use a new model shape; only assert mem less than or equal to theoretical;

* [fix] fix model zoo import;

* [fix] fix mem assert;

* [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* zbv support zero

* fix

* fix

* fix

---------

Co-authored-by: HangXu <hangxu0304@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: GuangyaoZhang <xjtu521@qq.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: wangbluo <2538539015@qq.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>

* [fix] fix llama, mixtral benchmark zbv loss none bug; update mixtral & llama policy and modeling;

* [feat] Linear1D_COL/ROW support zbv WeightGradStore;

* [feat] support use_zbv in llama, mixtral modeling; only replace Linear1D_Col/Row policy;

* [fix] fix test case; moe error in second iter

* [feat]EPMixtralSparseMoeBlock (op in MOE) support zbv;

* [fix] fix bwd b; now bwd w only for Layer replaced by Linear1D_Col/Row; other layer perform a fully bwd;

* [fix] debug zbv llama test;

* [fix] rm use_zbv flag in Shardconfig; rm debug info;

* [fix] add & fix  llama test

* [feat] support meta cache, meta_grad_send, meta_tensor_send; fix runtime too long in Recv Bwd; benchmark for llama + Hybrid(tp+pp);

* [fix] fix fail case test_shard_llama

* [fix] fix test_shard_llama

* [fix] fix llama modeling policy;

* [fix] fix test_shard_llama ci;

* [fix] fix test zerobubble

* [fix] fix handle name; rm useless comments;

* [fix] fix send recv signature;

* [fix] fix comment in llama & benchmark

* [feat] support no tensor parallel Linear in shardformer; Add test for use weightGradStore and not use WeightGradStore

* [fix] fix linear (no tp) ops func name;

* [feat] support zbv in mixtral benchmark; (#6083)

* [feat] support zbv in mixtral benchmark;

* [fix] MixtralForCausalLMPolicy get_held_layer support zbv;

* [feat] update MixtralPipelineForwards --> mixtral_model_forward; support zbv;

* [feat] support MixtralPipelineForwards--> mixtral_for_causal_lm_forward for zbv

* [fix] fix llama, mixtral benchmark zbv loss none bug; update mixtral & llama policy and modeling;

* [feat] Linear1D_COL/ROW support zbv WeightGradStore;

* [feat] support use_zbv in llama, mixtral modeling; only replace Linear1D_Col/Row policy;

* [fix] fix test case; moe error in second iter

* [feat]EPMixtralSparseMoeBlock (op in MOE) support zbv;

* [fix] fix bwd b; now bwd w only for Layer replaced by Linear1D_Col/Row; other layer perform a fully bwd;

* [fix] debug zbv llama test;

* [fix] rm use_zbv flag in Shardconfig; rm debug info;

* [fix] add & fix  llama test

* [feat] support meta cache, meta_grad_send, meta_tensor_send; fix runtime too long in Recv Bwd; benchmark for llama + Hybrid(tp+pp);

* [fix] fix fail case test_shard_llama

* [fix] fix test_shard_llama

* [fix] fix llama modeling policy;

* [fix] fix test_shard_llama ci;

* [fix] fix test zerobubble

* [fix] fix handle name; rm useless comments;

* [fix] fix send recv signature;

* [fix] fix comment in llama & benchmark

* [feat] support no tensor parallel Linear in shardformer; Add test for use weightGradStore and not use WeightGradStore

* [fix] fix linear (no tp) ops func name;

* [fix] fix fp8 args in HybridParallel

* [fix] fix hybridparall use_fp8 config

* [fix] fix use_fp8 flag

* [fix] fix model zoo init

* [feat] support no_tp Linear for sharderformer.llama

* [fix] fix zbv llama pp4

* [fix] fix send_tensor_metadata & send_grad_metadata;

* [feat] fix testcase;

* [feat] support mixtral policy with zbv tp_Linear & non_tp_Linear

* [feat] update mixtral policy & bert policy for zerobubble

* [fix] fix p2p error in zbv

* [fix] fix attn

* [fix] fix mixtral modeling & policy; update wait handles; doing benchmarking for llama hybrid;

* [fix] fix zbv wait_handle

* [fix] rm debug info; update llama policy; update wait handle

* [fix] fix test_lora

* [fix] fix test_lora in llama policy

* [fix] fix wait handle in run_fwd_bwd

* [fix] remove debug info;

* [fix] rm unused comments

* [fix] fix fp8 overlap code

* [fix] fix yml file & v_schedule comments

* [fix] rm fwd only meta cache comments;

---------

Co-authored-by: flybird11111 <1829166702@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>
Co-authored-by: GuangyaoZhang <xjtu521@qq.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: wangbluo <2538539015@qq.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
2024-11-19 19:00:36 +08:00
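
The zero-bubble (ZBV) work above hinges on splitting backward into an input-gradient pass (B) and a deferred weight-gradient pass (W), so weight gradients can be computed while a stage would otherwise sit idle. A minimal PyTorch sketch of that decoupling, illustrative only and not the ColossalAI scheduler:

```python
import torch
import torch.nn as nn

layer = nn.Linear(16, 16)
x = torch.randn(4, 16, requires_grad=True)
loss = layer(x).sum()

# B step: gradient w.r.t. the stage input only; keep the graph for later.
(grad_x,) = torch.autograd.grad(loss, (x,), retain_graph=True)

# W step: weight gradients computed later, filling the pipeline bubble.
grad_w, grad_b = torch.autograd.grad(loss, (layer.weight, layer.bias))
```
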
ver217
184a653704 [checkpointio] fix pinned state dict 2024-11-19 14:51:39 +08:00
ver217
5fa657f0a1 [checkpointio] fix size compute 2024-11-19 14:51:39 +08:00
flybird11111
eb69e640e5 [async io] support async io (#6137)
* support async optimizer save/load

* fix

* fix

* support pin mem

* Update low_level_zero_plugin.py

* fix

* fix

* fix

* fix

* fix
2024-11-19 14:51:39 +08:00
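
The async I/O commits above combine pinned host buffers with a background writer so checkpointing does not stall training. A rough sketch of that pattern in plain PyTorch (the function name and structure are illustrative, not the plugin's API):

```python
import threading
import torch

def async_save(state_dict: dict, path: str) -> threading.Thread:
    # Stage CUDA tensors into pinned CPU buffers so the device-to-host copy
    # can run asynchronously, then serialize on a background thread.
    staged = {}
    for name, t in state_dict.items():
        if isinstance(t, torch.Tensor) and t.is_cuda:
            buf = torch.empty(t.shape, dtype=t.dtype, pin_memory=True)
            buf.copy_(t, non_blocking=True)
            staged[name] = buf
        else:
            staged[name] = t
    torch.cuda.synchronize()  # make sure all D2H copies have landed
    writer = threading.Thread(target=torch.save, args=(staged, path), daemon=True)
    writer.start()
    return writer  # caller should join() before the next save or at exit
```
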
Hongxin Liu
b90835bd32 [checkpointio] fix performance issue (#6139) 2024-11-19 14:51:39 +08:00
Wang Binluo
8e08c27e19 [ckpt] Add async ckpt api (#6136)
* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix

* fix
2024-11-19 14:51:39 +08:00
Hongxin Liu
d4a436051d [checkpointio] support async model save (#6131)
* [checkpointio] support async model save

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-11-19 14:51:39 +08:00
Hongxin Liu
5a03d2696d
[cli] support run as module option (#6135) 2024-11-14 18:10:37 +08:00
Hanks
cc40fe0e6f
[fix] multi-node backward slowdown (#6134)
* remove redundant memcpy during backward

* get back record_stream
2024-11-14 17:45:49 +08:00
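
The fix above brings back `record_stream` so tensors consumed on a side communication stream are not recycled too early by the caching allocator. A small illustration of the API, not the actual ZeRO code path:

```python
import torch

comm_stream = torch.cuda.Stream()
grad = torch.randn(1024, 1024, device="cuda")

with torch.cuda.stream(comm_stream):
    grad.mul_(2.0)  # stands in for an async collective launched on comm_stream

# Tell the caching allocator that comm_stream still uses this memory, so it
# is not reused by the default stream before the side-stream work finishes.
grad.record_stream(comm_stream)
```
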
duanjunwen
c2fe3137e2
[hotfix] fix flash attn window_size err (#6132)
* [fix] fix flash attn

* [hotfix] fix flash-atten version

* [fix] fix flash_atten version

* [fix] fix flash-atten versions

* [fix] fix flash-attn not enough values to unpack error

* [fix] fix test_ring_attn

* [fix] fix test ring attn
2024-11-14 17:11:35 +08:00
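
flash-attn's `window_size` restricts each query to a local band of keys, and version bumps changed both the argument and the return signature, which is what this hotfix chases. A pure-PyTorch stand-in for the masking semantics (not the flash-attn API):

```python
import torch
import torch.nn.functional as F

def sliding_window_attention(q, k, v, window: int):
    # Causal attention where each query only sees the previous `window` keys,
    # roughly what flash-attn's window_size=(window, 0) expresses.
    T = q.size(-2)
    idx = torch.arange(T, device=q.device)
    mask = (idx[None, :] <= idx[:, None]) & (idx[:, None] - idx[None, :] <= window)
    return F.scaled_dot_product_attention(q, k, v, attn_mask=mask)

q = k = v = torch.randn(1, 8, 128, 64)
out = sliding_window_attention(q, k, v, window=32)
```
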
Hongxin Liu
a2596519fd
[zero] support extra dp (#6123)
* [zero] support extra dp

* [zero] update checkpoint

* fix bugs

* fix bugs
2024-11-12 11:20:46 +08:00
Tong Li
30a9443132
[Coati] Refine prompt for better inference (#6117)
* refine prompt

* update prompt

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-11-08 11:00:37 +08:00
Tong Li
7a60161035
update readme (#6116) 2024-11-06 17:24:08 +08:00
Hongxin Liu
a15ab139ad
[plugin] support get_grad_norm (#6115) 2024-11-05 18:12:47 +08:00
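
A `get_grad_norm` utility boils down to a norm over all parameter gradients; a minimal sketch of the quantity being exposed (not the plugin's exact implementation, which also has to handle sharded gradients):

```python
import torch

def get_grad_norm(parameters, norm_type: float = 2.0) -> float:
    # Total gradient norm across all parameters that currently hold a gradient.
    grads = [p.grad.detach() for p in parameters if p.grad is not None]
    if not grads:
        return 0.0
    per_param = torch.stack([torch.linalg.vector_norm(g, norm_type) for g in grads])
    return torch.linalg.vector_norm(per_param, norm_type).item()
```
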
Hongxin Liu
13ffa08cfa
[release] update version (#6109) 2024-11-04 17:26:28 +08:00
pre-commit-ci[bot]
2f583c1549
[pre-commit.ci] pre-commit autoupdate (#6078)
updates:
- [github.com/psf/black-pre-commit-mirror: 24.8.0 → 24.10.0](https://github.com/psf/black-pre-commit-mirror/compare/24.8.0...24.10.0)
- [github.com/pre-commit/mirrors-clang-format: v18.1.8 → v19.1.2](https://github.com/pre-commit/mirrors-clang-format/compare/v18.1.8...v19.1.2)
- [github.com/pre-commit/pre-commit-hooks: v4.6.0 → v5.0.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.6.0...v5.0.0)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-10-31 18:18:01 +08:00
Hongxin Liu
c2e8f61592
[checkpointio] fix hybrid plugin model save (#6106) 2024-10-31 17:04:53 +08:00
Tong Li
89a9a600bc
[MCTS] Add self-refined MCTS (#6098)
* add reasoner

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update code

* delete llama

* update prompts

* update readme

* update readme

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-10-24 17:51:19 +08:00
binmakeswell
4294ae83bb
[doc] sora solution news (#6100)
* [doc] sora solution news

* [doc] sora solution news
2024-10-24 13:24:37 +08:00
Hongxin Liu
80a8ca916a
[extension] hotfix compile check (#6099) 2024-10-24 11:11:44 +08:00
Hanks
dee63cc5ef
Merge pull request #6096 from BurkeHulk/hotfix/lora_ckpt
[hotfix] fix lora ckpt saving format
2024-10-21 14:13:04 +08:00
BurkeHulk
6d6cafabe2 pre-commit fix 2024-10-21 14:04:32 +08:00
BurkeHulk
b10339df7c fix lora ckpt save format (ColoTensor to Tensor) 2024-10-21 13:55:43 +08:00
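
The LoRA checkpoint fix converts wrapped tensors back to plain `torch.Tensor` before saving, so the file loads without the wrapper class being importable. A hypothetical sketch of that conversion (names are illustrative):

```python
import torch

def to_plain_tensors(state_dict: dict) -> dict:
    # Materialize every entry as a plain torch.Tensor before torch.save, so the
    # checkpoint does not embed a custom tensor subclass.
    plain = {}
    for key, value in state_dict.items():
        if isinstance(value, torch.Tensor) and type(value) is not torch.Tensor:
            src = value.detach().cpu()
            value = torch.empty(src.shape, dtype=src.dtype).copy_(src)
        plain[key] = value
    return plain
```
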
Hongxin Liu
19baab5fd5
[release] update version (#6094) 2024-10-21 10:19:08 +08:00
Hongxin Liu
58d8b8a2dd
[misc] fit torch api upgradation and remove legecy import (#6093)
* [amp] fit torch's new api

* [amp] fix api call

* [amp] fix api call

* [misc] fit torch pytree api upgrade

* [misc] remove legacy import

* [misc] fit torch amp api

* [misc] fit torch amp api
2024-10-18 16:48:52 +08:00
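
The torch API commits above track the deprecation of the CUDA-specific AMP entry points in favor of the device-generic `torch.amp` ones; roughly:

```python
import torch

# Device-generic AMP API (GradScaler moved to torch.amp in recent PyTorch):
scaler = torch.amp.GradScaler("cuda")                   # was: torch.cuda.amp.GradScaler()
with torch.amp.autocast("cuda", dtype=torch.float16):   # was: torch.cuda.amp.autocast()
    pass  # forward pass goes here
```
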
Hongxin Liu
5ddad486ca
[fp8] add fallback and make compile option configurable (#6092) 2024-10-18 13:55:31 +08:00
botbw
3b1d7d1ae8 [chore] refactor 2024-10-17 11:04:47 +08:00
botbw
2bcd0b6844 [ckpt] add safetensors util 2024-10-17 11:04:47 +08:00
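
A safetensors util is built around the library's simple dict-in, dict-out interface; a minimal round trip (any sharding or async I/O on top is an assumption here):

```python
import torch
from safetensors.torch import save_file, load_file

state = {"weight": torch.randn(8, 8), "bias": torch.zeros(8)}
save_file(state, "model.safetensors")      # tensors must be contiguous
restored = load_file("model.safetensors")  # returns a dict of plain tensors
```
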
Hongxin Liu
cd61353bae
[pipeline] hotfix backward for multiple outputs (#6090)
* [pipeline] hotfix backward for multiple outputs

* [pipeline] hotfix backward for multiple outputs
2024-10-16 17:27:33 +08:00
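
When a pipeline stage produces several output tensors, backward has to be driven with one upstream gradient per output rather than a single scalar loss, which is the case this hotfix covers. In plain autograd terms:

```python
import torch

x = torch.randn(4, 4, requires_grad=True)
out1, out2 = x * 2, x + 1

# One upstream gradient per output tensor (all-ones placeholders here).
torch.autograd.backward((out1, out2), (torch.ones_like(out1), torch.ones_like(out2)))
print(x.grad)  # 2 + 1 = 3 everywhere
```
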
Wenxuan Tan
62c13e7969
[Ring Attention] Improve comments (#6085)
* improve comments

* improve comments

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-10-16 11:23:35 +08:00
Wang Binluo
dcd41d0973
Merge pull request #6071 from wangbluo/ring_attention
[Ring Attention] fix the 2d ring attn when using multiple machine
2024-10-15 15:17:21 +08:00
wangbluo
83cf2f84fb fix 2024-10-15 14:50:27 +08:00
wangbluo
bc7eeade33 fix 2024-10-15 13:28:33 +08:00
wangbluo
fd92789af2 fix 2024-10-15 13:26:44 +08:00
wangbluo
6be9862aaf fix 2024-10-15 11:56:49 +08:00
wangbluo
3dc08c8a5a fix 2024-10-15 11:01:34 +08:00
wangbluo
8ff7d0c780 fix 2024-10-14 18:16:03 +08:00
wangbluo
fe9208feac fix 2024-10-14 18:07:56 +08:00
wangbluo
3201377e94 fix 2024-10-14 18:06:24 +08:00
wangbluo
23199e34cc fix 2024-10-14 18:01:53 +08:00
wangbluo
d891e50617 fix 2024-10-14 14:56:05 +08:00
wangbluo
e1e86f9f1f fix 2024-10-14 11:45:35 +08:00
Tong Li
4c8e85ee0d
[Coati] Train DPO using PP (#6054)
* update dpo

* remove unsupport plugin

* update msg

* update dpo

* remove unsupport plugin

* update msg

* update template

* update dataset

* add pp for dpo

* update dpo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add dpo fn

* update dpo

* update dpo

* update dpo

* update dpo

* minor update

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update loss

* update help

* polish code

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-10-11 19:32:00 +08:00
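
For reference, the PP-enabled DPO trainer optimizes the standard DPO objective; a generic sketch of the loss (not the Coati code):

```python
import torch
import torch.nn.functional as F

def dpo_loss(logp_chosen, logp_rejected, ref_logp_chosen, ref_logp_rejected, beta: float = 0.1):
    # Prefer the chosen response relative to a frozen reference policy.
    margin = (logp_chosen - ref_logp_chosen) - (logp_rejected - ref_logp_rejected)
    return -F.logsigmoid(beta * margin).mean()
```
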
wangbluo
703bb5c18d fix the test 2024-10-11 17:34:20 +08:00
wangbluo
4e0e99bb6a fix the test 2024-10-11 17:31:40 +08:00
wangbluo
1507a7528f fix 2024-10-11 06:20:34 +00:00
wangbluo
0002ae5956 fix 2024-10-11 14:16:21 +08:00
Hongxin Liu
dc2cdaf3e8
[shardformer] optimize seq parallelism (#6086)
* [shardformer] optimize seq parallelism

* [shardformer] fix gpt2 fused linear col

* [plugin] update gemini plugin

* [plugin] update moe hybrid plugin

* [test] update gpt2 fused linear test

* [shardformer] fix gpt2 fused linear reduce
2024-10-11 13:44:40 +08:00
wangbluo
efe3042bb2 fix 2024-10-10 18:38:47 +08:00
梁爽
6b2c506fc5
Update README.md (#6087)
add HPC-AI.COM activity
2024-10-10 17:02:49 +08:00
wangbluo
5ecc27e150 fix 2024-10-10 15:35:52 +08:00
wangbluo
f98384aef6 fix 2024-10-10 15:17:06 +08:00
Hongxin Liu
646b3c5a90
[shardformer] fix linear 1d row and support uneven splits for fused qkv linear (#6084)
* [tp] hotfix linear row

* [tp] support uneven split for fused linear

* [tp] support sp for fused linear

* [tp] fix gpt2 mlp policy

* [tp] fix gather fused and add fused linear row
2024-10-10 14:34:45 +08:00
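
An uneven split is unavoidable once the fused QKV projection serves grouped-query attention, where K/V have fewer heads than Q; the sketch below (hypothetical sizes) shows why three equal chunks no longer work:

```python
import torch

hidden, q_heads, kv_heads, head_dim = 4096, 32, 8, 128
fused_qkv = torch.nn.Linear(hidden, (q_heads + 2 * kv_heads) * head_dim)

out = fused_qkv(torch.randn(2, 16, hidden))
q, k, v = out.split(
    [q_heads * head_dim, kv_heads * head_dim, kv_heads * head_dim], dim=-1
)
```
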
wangbluo
b635dd0669 fix 2024-10-09 14:05:26 +08:00
wangbluo
3532f77b90 fix 2024-10-09 10:57:19 +08:00
wangbluo
3fab92166e fix 2024-09-26 18:03:09 +08:00
binmakeswell
f4daf04270
add funding news (#6072)
* add funding news

* add funding news

* add funding news
2024-09-26 12:29:27 +08:00
wangbluo
6705dad41b fix 2024-09-25 19:02:21 +08:00
wangbluo
91ed32c256 fix 2024-09-25 19:00:38 +08:00
wangbluo
6fb1322db1 fix 2024-09-25 18:56:18 +08:00
wangbluo
65c8297710 fix the attn 2024-09-25 18:51:03 +08:00
wangbluo
cfd9eda628 fix the ring attn 2024-09-25 18:34:29 +08:00
binmakeswell
cbaa104216
release FP8 news (#6068)
* add FP8 news

* release FP8 news

* release FP8 news
2024-09-25 11:57:16 +08:00
Hongxin Liu
dabc2e7430
[release] update version (#6062) 2024-09-19 10:45:32 +08:00
Camille Zhong
f9546ba0be
[ColossalEval] support for vllm (#6056)
* support vllm

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* modify vllm and update readme

* run pre-commit

* remove duplicated lines and refine code

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update param name

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* refine code

* update readme

* refine code

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-09-18 17:09:45 +08:00
botbw
4fa6b9509c
[moe] add parallel strategy for shared_expert && fix test for deepseek (#6063) 2024-09-18 10:09:01 +08:00
Wang Binluo
63314ce4e4
Merge pull request #6064 from wangbluo/fix_attn
[sp] : fix the attention kernel for sp
2024-09-18 10:08:15 +08:00
wangbluo
10e4f7da72 fix 2024-09-16 13:45:04 +08:00
Wang Binluo
37e35230ff
Merge pull request #6061 from wangbluo/sp_fix
[sp] : fix the attention kernel for sp
2024-09-14 20:54:35 +08:00
wangbluo
827ef3ee9a fix 2024-09-14 10:40:35 +00:00
Guangyao Zhang
bdb125f83f
[doc] FP8 training and communication document (#6050)
* Add FP8 training and communication document

* add fp8 docstring for plugins

* fix typo

* fix typo
2024-09-14 11:01:05 +08:00
Guangyao Zhang
f20b066c59
[fp8] Disable all_gather intranode. Disable Redundant all_gather fp8 (#6059)
* all_gather only internode, fix pytest

* fix cuda arch <89 compile pytest error

* fix pytest failure

* disable all_gather_into_tensor_flat_fp8

* fix fp8 format

* fix pytest

* fix conversations

* fix chunk tuple to list
2024-09-14 10:40:01 +08:00
wangbluo
b582319273 fix 2024-09-13 10:24:41 +00:00
wangbluo
0ad3129cb9 fix 2024-09-13 09:01:26 +00:00
wangbluo
0b14a5512e fix 2024-09-13 07:06:14 +00:00
botbw
696fced0d7
[fp8] fix missing fp8_comm flag in mixtral (#6057) 2024-09-13 14:30:05 +08:00
wangbluo
dc032172c3 fix 2024-09-13 06:00:58 +00:00
wangbluo
f393867cff fix 2024-09-13 05:24:52 +00:00
wangbluo
6eb8832366 fix 2024-09-13 05:06:56 +00:00
wangbluo
683179cefd fix 2024-09-13 03:40:56 +00:00
wangbluo
0a01e2a453 fix the attn 2024-09-13 03:38:35 +00:00
pre-commit-ci[bot]
216d54e374 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-09-13 02:38:40 +00:00
wangbluo
fdd84b9087 fix the sp 2024-09-13 02:32:03 +00:00
flybird11111
a35a078f08
[doc] update sp doc (#6055)
* update sp doc

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-09-11 17:25:14 +08:00
Hongxin Liu
13946c4448
[fp8] hotfix backward hook (#6053)
* [fp8] hotfix backward hook

* [fp8] hotfix pipeline loss accumulation
2024-09-11 16:11:25 +08:00
botbw
c54c4fcd15
[hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048)
* [example] pass use_fp8_comm flag to all plugins

* [example] add mixtral benchmark

* [moe] refine assertion and check

* [moe] fix mixtral & add more tests

* [moe] consider checking dp * sp group and moe_dp_group

* [mixtral] remove gate tp & add more tests

* [deepseek] fix tp & sp for deepseek

* [mixtral] minor fix

* [deepseek] add deepseek benchmark
2024-09-10 17:30:53 +08:00
Wenxuan Tan
8fd25d6e09
[Feature] Split cross-entropy computation in SP (#5959)
* halfway

* fix cross-PP-stage position id length diff bug

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* unified cross entropy func for all shardformer models

* remove redundant lines

* add basic ring attn; debug cross entropy

* fwd bwd logic complete

* fwd bwd logic complete; add experimental triton rescale

* precision tests passed

* precision tests passed

* fix typos and remove misc files

* update softmax_lse shape by new interface

* change tester name

* remove buffer clone; support packed seq layout

* add varlen tests

* fix typo

* all tests passed

* add dkv_group; fix mask

* remove debug statements

* adapt chatglm, command-R, qwen

* debug

* halfway

* fix cross-PP-stage position id length diff bug

* fix typo

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* unified cross entropy func for all shardformer models

* remove redundant lines

* add basic ring attn; debug cross entropy

* fwd bwd logic complete

* fwd bwd logic complete; add experimental triton rescale

* precision tests passed

* precision tests passed

* fix typos and remove misc files

* add sp_mode to benchmark; fix varlen interface

* update softmax_lse shape by new interface

* add varlen tests

* fix typo

* all tests passed

* add dkv_group; fix mask

* remove debug statements

* add comments

* q1 index only once

* remove events to simplify stream sync

* simplify forward/backward logic

* 2d ring forward passed

* 2d ring backward passed

* fixes

* fix ring attn loss

* 2D ring backward + llama passed

* merge

* update logger

* fix typo

* rebase

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix typo

* remove typos

* fixes

* support GPT

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-09-10 12:06:50 +08:00
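
With the sequence dimension sharded, each rank only holds the logits for its own tokens, so the cross-entropy must be reduced across the sequence-parallel group. A forward-only sketch of that reduction (not the shardformer implementation, which also stays autograd-aware):

```python
import torch
import torch.distributed as dist
import torch.nn.functional as F

def sp_cross_entropy(local_logits, local_labels, ignore_index=-100, group=None):
    # Per-rank loss over local tokens, then a global mean via two all-reduces.
    loss_sum = F.cross_entropy(
        local_logits, local_labels, ignore_index=ignore_index, reduction="sum"
    )
    n_tokens = (local_labels != ignore_index).sum().to(loss_sum.dtype)
    dist.all_reduce(loss_sum, group=group)
    dist.all_reduce(n_tokens, group=group)
    return loss_sum / n_tokens.clamp(min=1)
```
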
Hongxin Liu
b3db1058ec
[release] update version (#6041)
* [release] update version

* [devops] update comp test

* [devops] update comp test debug

* [devops] debug comp test

* [devops] debug comp test

* [devops] debug comp test

* [devops] debug comp test

* [devops] debug comp test
2024-09-10 10:31:09 +08:00
Hanks
5ce6dd75bf
[fp8] disable all_to_all_fp8 in intranode (#6045)
* enhance all_to_all_fp8 with internode comm control

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* disable some fp8 ops due to performance issue

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-09-09 13:47:17 +08:00
Hongxin Liu
26e553937b
[fp8] fix linear hook (#6046) 2024-09-03 16:37:16 +08:00
Hongxin Liu
c3b5caff0e
[fp8] optimize all-gather (#6043)
* [fp8] optimize all-gather

* [fp8] fix all gather fp8 ring

* [fp8] enable compile

* [fp8] fix all gather fp8 ring
2024-09-03 15:45:17 +08:00
Tong Li
c650a906db
[Hotfix] Remove deprecated install (#6042)
* remove deprecated install

* remove unused folder
2024-09-03 10:33:18 +08:00
Gao, Ruiyuan
e9032fb0b2
[colossalai/checkpoint_io/...] fix bug in load_state_dict_into_model; format error msg (#6020)
* fix bug in load_state_dict_into_model; format error msg

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Update utils.py

to support checking missing_keys

* Update general_checkpoint_io.py

fix bug in missing_keys error message

* retrigger tests

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-09-02 16:56:35 +08:00
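
The checkpoint_io fix is about surfacing `missing_keys` / `unexpected_keys` with a readable message instead of failing opaquely; the underlying PyTorch behaviour looks like this:

```python
import torch
import torch.nn as nn

model = nn.Linear(4, 4)
result = model.load_state_dict({"weight": torch.zeros(4, 4)}, strict=False)
if result.missing_keys or result.unexpected_keys:
    print(f"missing keys: {result.missing_keys}, unexpected keys: {result.unexpected_keys}")
```
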
Guangyao Zhang
e96a0761ea
[FP8] unsqueeze scale to make it compatible with torch.compile (#6040) 2024-08-29 14:49:23 +08:00
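
Keeping the FP8 scale as a 1-element tensor rather than a Python scalar is what makes the op friendlier to `torch.compile`, presumably because the value stays a traced tensor instead of a baked-in constant. A hedged sketch of per-tensor FP8 quantization along those lines:

```python
import torch

def quantize_fp8(x: torch.Tensor):
    scale = x.abs().max().clamp(min=1e-12) / torch.finfo(torch.float8_e4m3fn).max
    scale = scale.unsqueeze(0)  # shape (1,) so the scale remains a tensor
    return (x / scale).to(torch.float8_e4m3fn), scale

def dequantize_fp8(q: torch.Tensor, scale: torch.Tensor, dtype=torch.float32):
    return q.to(dtype) * scale
```
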
Tong Li
0d3a85d04f
add fused norm (#6038) 2024-08-28 17:12:51 +08:00
Tong Li
4a68efb7da
[Colossal-LLaMA] Refactor latest APIs (#6030)
* refactor latest code

* update api

* add dummy dataset

* update Readme

* add setup

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update files

* add PP support

* update arguments

* update argument

* reorg folder

* update version

* remove IB infor

* update utils

* update readme

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update save for zero

* update save

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add apex

* update

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-08-28 17:01:58 +08:00
Hongxin Liu
cc1b0efc17
[plugin] hotfix zero plugin (#6036)
* [plugin] hotfix zero plugin

* [plugin] hotfix zero plugin
2024-08-28 10:16:48 +08:00
Wenxuan Tan
d383449fc4
[CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018)
* remove triton version

* remove torch 2.2

* remove torch 2.1

* debug

* remove 2.1 build tests

* require torch >=2.2

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-08-27 10:12:21 +08:00
Hongxin Liu
17904cb5bf
Merge pull request #6012 from hpcaitech/feature/fp8_comm
[fp8]  support fp8 communication and fp8 training for Colossalai
2024-08-27 10:09:43 +08:00
Wang Binluo
4a6f31eb0c
Merge pull request #6033 from wangbluo/fix
[fp8] fix the merge
2024-08-26 14:06:06 +08:00
pre-commit-ci[bot]
80d24ae519 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-08-26 03:48:43 +00:00
wangbluo
dae39999d7 fix 2024-08-26 03:45:42 +00:00
Wenxuan Tan
7cf9df07bc
[Hotfix] Fix llama fwd replacement bug (#6031)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-08-23 15:44:27 +08:00
Wang Binluo
0bf46c54af
Merge pull request #6029 from hpcaitech/flybird11111-patch-1
Update train_dpo.py
2024-08-23 13:50:04 +08:00
flybird11111
9e767643dd
Update low_level_zero_plugin.py 2024-08-23 13:49:53 +08:00
pre-commit-ci[bot]
3b0df30362 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-08-23 05:48:12 +00:00
flybird11111
0bc9a870c0
Update train_dpo.py 2024-08-23 13:47:13 +08:00
Hongxin Liu
caab4a307f
Merge branch 'main' into feature/fp8_comm 2024-08-22 15:14:38 +08:00
Wang Binluo
afe845ff15
Merge pull request #6024 from wangbluo/fix_merge
[fp8] merge
2024-08-22 11:07:04 +08:00
pre-commit-ci[bot]
a292554179 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-08-22 03:04:44 +00:00
wangbluo
971b16a74f fix 2024-08-22 03:00:40 +00:00
Wang Binluo
d77e66a577
Merge pull request #6023 from wangbluo/fp8_merge
[fp8] merge
2024-08-22 10:32:13 +08:00
Wang Binluo
eea37da6fa
[fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016)
* add SimPO

* fix dataloader

* remove debug code

* add orpo

* fix style

* fix colossalai, transformers version

* fix colossalai, transformers version

* fix colossalai, transformers version

* fix torch colossalai version

* update transformers version

* [shardformer] DeepseekMoE support (#5871)

* [Feature] deepseek moe expert parallel implement

* [misc] fix typo, remove redundant file (#5867)

* [misc] fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] deepseek support & unit test

* [misc] remove debug code & useless print

* [misc] fix typos (#5872)

* [Feature] remove modeling file, use auto config. (#5884)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [Deepseek] remove redundant code (#5888)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [misc] remove redundant code

* [Feature/deepseek] resolve comment. (#5889)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [misc] remove redundant code

* [misc] mv module replacement into if branch

* [misc] add some warning message and modify some code in unit test

* [misc] fix typos

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Hotfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)

* Diffusion Model Inference support

* Stable Diffusion 3 Support

* pixartalpha support

* [HotFix] CI,import,requirements-test for #5838 (#5892)

* [Hot Fix] CI,import,requirements-test

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] Enable PP + SP for llama (#5868)

* fix cross-PP-stage position id length diff bug

* fix typo

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* use one cross entropy func for all shardformer models

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)

* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint

* fix style

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix eval

* hotfix citation

* [zero] support all-gather overlap (#5898)

* [zero] support all-gather overlap

* [zero] add overlap all-gather flag

* [misc] fix typo

* [zero] update api

* fix orpo cross entropy loss

* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)

* Remove unnecessary calls to deepcopy

* Build DimSpec's difference dict only once

This change considerably speeds up construction of DimSpec objects. The difference_dict is the same for every DimSpec object, so a single copy of it is enough.

* Fix documentation of DimSpec's difference method

* [ShardFormer] fix qwen2 sp (#5903)

* [compatibility] support torch 2.2 (#5875)

* Support Pytorch 2.2.2

* keep build_on_pr file and update .compatibility

* fix object_to_tensor usage when torch>=2.3.0 (#5820)

* [misc] support torch2.3 (#5893)

* [misc] support torch2.3

* [devops] update compatibility ci

* [devops] update compatibility ci

* [devops] add debug

* [devops] add debug

* [devops] add debug

* [devops] add debug

* [devops] remove debug

* [devops] remove debug

* [release] update version (#5912)

* [plugin] support all-gather overlap for hybrid parallel (#5919)

* [plugin] fixed all-gather overlap support for hybrid parallel

* add kto

* fix style, add kto data sample

* [Examples] Add lazy init to OPT and GPT examples (#5924)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [ColossalChat] Hotfix for ColossalChat (#5910)

* add ignore and tiny llama

* fix path issue

* run style

* fix issue

* update bash

* add ignore and tiny llama

* fix path issue

* run style

* fix issue

* update bash

* fix ddp issue

* add Qwen 1.5 32B

* refactor tokenization

* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)

* cannot access local variable 'default_conversation' where it is not associated with a value

set default value for 'default_conversation'

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix test data

* refactor evaluation

* remove real data path

* remove real data path

* Add n_fused as an input from native_module (#5894)

* [FIX BUG] convert env param to int in (#5934)

* [Hotfix] Fix ZeRO typo #5936

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)

* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix style

* fix style

* fix style

* [shardformer] hotfix attn mask (#5945)

* [shardformer] hotfix attn mask (#5947)

* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)

* Distrifusion Support source

* comp comm overlap optimization

* sd3 benchmark

* pixart distrifusion bug fix

* sd3 bug fix and benchmark

* generation bug fix

* naming fix

* add docstring, fix counter and shape error

* add reference

* readme and requirement

* [zero] hotfix update master params (#5951)

* [release] update version (#5952)

* [Chat] Fix lora (#5946)

* fix merging

* remove filepath

* fix style

* Update README.md (#5958)

* [hotfix] Remove unused plan section (#5957)

* remove readme

* fix readme

* update

* [test] add mixtral for sequence classification

* [test] add mixtral transformer test

* [moe] fix plugin

* [test] mixtral pp shard test

* [chore] handle non member group

* [zero] solve hang

* [test] pass mixtral shardformer test

* [moe] implement transit between non moe tp and ep

* [zero] solve hang

* [misc] solve booster hang by renaming the variable

* solve hang when parallel mode = pp + dp

* [moe] implement submesh initialization

* [moe] add mixtral dp grad scaling when not all experts are activated

* [chore] manually revert unintended commit

* [chore] trivial fix

* [chore] arg pass & remove drop token

* [test] add mixtral modelling test

* [moe] implement tp

* [moe] test deepseek

* [moe] clean legacy code

* [Feature] MoE Ulysses Support (#5918)

* moe sp support

* moe sp bug solve

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [chore] minor fix

* [moe] init moe plugin comm setting with sp

* moe sp + ep bug fix

* [moe] finalize test (no pp)

* [moe] full test for deepseek and mixtral (pp + sp to fix)

* [chore] minor fix after rebase

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [chore] solve moe ckpt test failure and some other arg pass failure

* [moe] remove ops

* [test] fix test: test_zero1_2

* [bug] fix: somehow logger hangs the program

* [moe] deepseek moe sp support

* [test] add check

* [deepseek] replace attn (a workaround for bug in transformers)

* [misc] skip redundant test

* [misc] remove debug/print code

* [moe] refactor mesh assignment

* Revert "[moe] implement submesh initialization"

This reverts commit 2f9bce6686.

* [chore] change moe_pg_mesh to private

* [misc] remove incompatible test config

* [misc] fix ci failure: change default value to false in moe plugin

* [misc] remove useless condition

* [chore] docstring

* [moe] remove force_overlap_comm flag and add warning instead

* [doc] add MoeHybridParallelPlugin docstring

* [moe] solve dp axis issue

* [chore] remove redundant test case, print string & reduce test tokens

* [feat] Dist Loader for Eval (#5950)

* support auto distributed data loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* support auto distributed data loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix tp error

* remove unused parameters

* remove unused

* update inference

* update docs

* update inference

---------

Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [lora] lora support hybrid parallel plugin (#5956)

* lora support hybrid plugin

* fix

* fix

* fix

* fix

* Support overall loss, update KTO logging

* [Docs] clarify launch port

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Hotfix] README link (#5966)

* update ignore

* update readme

* run style

* update readme

* [Hotfix] Avoid fused RMSnorm import error without apex (#5985)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Chat] fix readme (#5989)

* fix readme

* fix readme, tokenization fully tested

* fix readme, tokenization fully tested

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix sync condition (#6000)

* [plugin] add cast inputs option for zero (#6003)

* [pre-commit.ci] pre-commit autoupdate (#5995)

updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991)

* [Feature] Zigzag Ring attention (#5905)

* halfway

* fix cross-PP-stage position id length diff bug

* fix typo

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* unified cross entropy func for all shardformer models

* remove redundant lines

* add basic ring attn; debug cross entropy

* fwd bwd logic complete

* fwd bwd logic complete; add experimental triton rescale

* precision tests passed

* precision tests passed

* fix typos and remove misc files

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add sp_mode to benchmark; fix varlen interface

* update softmax_lse shape by new interface

* change tester name

* remove buffer clone; support packed seq layout

* add varlen tests

* fix typo

* all tests passed

* add dkv_group; fix mask

* remove debug statements

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [misc] update compatibility (#6008)

* [misc] update compatibility

* [misc] update requirements

* [devops] disable requirements cache

* [test] fix torch ddp test

* [test] fix rerun on address in use

* [test] fix lazy init

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix the merge

* fix the merge

* overlap kv comm with output rescale (#6017)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* fix the merge

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix the merge

* fix

* fix

* fix the merge

* fix

* [misc] Use dist logger in plugins (#6011)

* use dist logger in plugins

* remove trash

* print on rank 0

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* fix

* fix

* fix

* fix

* fix the merge

* fix

* fix

* fix

* fix

---------

Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
2024-08-22 09:21:34 +08:00
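
Most of the FP8 communication work in this merge follows the same pattern: quantize to 8 bits before the collective to roughly halve traffic, ship the payload as raw bytes, and dequantize on the receiving side. An illustrative sketch, not the library's fp8 hooks:

```python
import torch
import torch.distributed as dist

def all_gather_fp8(x: torch.Tensor, group=None):
    fp8_max = torch.finfo(torch.float8_e4m3fn).max
    scale = (x.abs().max() / fp8_max).clamp(min=1e-12).reshape(1)
    payload = (x / scale).to(torch.float8_e4m3fn).view(torch.uint8)  # send as bytes

    world = dist.get_world_size(group)
    bufs = [torch.empty_like(payload) for _ in range(world)]
    scales = [torch.empty_like(scale) for _ in range(world)]
    dist.all_gather(bufs, payload, group=group)
    dist.all_gather(scales, scale, group=group)

    # Dequantize each rank's shard back to the working dtype.
    return [b.view(torch.float8_e4m3fn).to(x.dtype) * s for b, s in zip(bufs, scales)]
```
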
wangbluo
8b8e282441 fix 2024-08-21 09:18:45 +00:00
wangbluo
698c8b9804 fix 2024-08-21 03:58:21 +00:00
wangbluo
6aface9316 fix 2024-08-21 03:51:25 +00:00
wangbluo
193030f696 fix 2024-08-21 03:21:49 +00:00
wangbluo
eb5ba40def fix the merge 2024-08-21 02:58:23 +00:00
Tong Li
39e2597426
[ColossalChat] Add PP support (#6001)
* support pp training

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update rm

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* refactor

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update test case

* fix

* change to 4

* fix eval

* test

* add pp

* hotfix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* support pp training

* update rm

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* refactor

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update test case

* fix

* change to 4

* fix eval

* test

* add pp

* hotfix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update

* skip pp eval

* update all reduce

* update sft

* update ignore

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* update no cache

* add eval

* remove fi

* remove debug

* remove parentheses to avoid warning

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Revert "add eval"

This reverts commit 3ab2f6fa32.

* add all reduce

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-08-21 10:47:39 +08:00
Hongxin Liu
0d3b0bd864
[plugin] add cast inputs option for zero (#6003) (#6022) 2024-08-21 10:21:26 +08:00
wangbluo
2d362ac090 fix merge 2024-08-20 09:26:04 +00:00
wangbluo
2e4cbe3a2d fix 2024-08-20 09:11:02 +00:00
wangbluo
2ee6235cfa fix 2024-08-20 06:48:16 +00:00
wangbluo
f7acfa1bd5 fix 2024-08-20 05:07:58 +00:00
wangbluo
53823118f2 fix 2024-08-20 03:20:13 +00:00
Edenzzzz
dcc44aab8d
[misc] Use dist logger in plugins (#6011)
* use dist logger in plugins

* remove trash

* print on rank 0

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-08-20 10:32:41 +08:00
wangbluo
1f703e0ef4 fix 2024-08-19 10:15:16 +00:00
wangbluo
88b3f0698c fix the merge 2024-08-19 10:11:27 +00:00
wangbluo
2eb36839c6 fix 2024-08-19 09:23:10 +00:00
wangbluo
12b44012d9 fix 2024-08-19 09:02:16 +00:00
wangbluo
0d8e82a024 Merge branch 'fp8_merge' of https://github.com/wangbluo/ColossalAI into fp8_merge 2024-08-19 08:10:27 +00:00
wangbluo
4c82bfcc54 fix the merge 2024-08-19 08:09:34 +00:00
pre-commit-ci[bot]
64aad96723 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-08-19 08:08:46 +00:00
wangbluo
3353042525 fix the merge 2024-08-19 08:07:51 +00:00
Edenzzzz
f1c3266a94
overlap kv comm with output rescale (#6017)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-08-19 14:08:17 +08:00
wangbluo
1a5847e6d1 fix the merge 2024-08-19 03:28:29 +00:00
wangbluo
52289e4c63 Merge branch 'fp8_merge' of https://github.com/wangbluo/ColossalAI into fp8_merge 2024-08-19 02:27:30 +00:00
wangbluo
02636c5bef fix the merge 2024-08-19 02:26:52 +00:00
pre-commit-ci[bot]
81272e9d00 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-08-17 09:37:39 +00:00
wangbluo
4cf79fa275 merge 2024-08-17 09:34:18 +00:00
Hongxin Liu
26493b97d3
[misc] update compatibility (#6008)
* [misc] update compatibility

* [misc] update requirements

* [devops] disable requirements cache

* [test] fix torch ddp test

* [test] fix rerun on address in use

* [test] fix lazy init
2024-08-16 18:49:14 +08:00
Edenzzzz
f5c84af0b0
[Feature] Zigzag Ring attention (#5905)
* halfway

* fix cross-PP-stage position id length diff bug

* fix typo

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* unified cross entropy func for all shardformer models

* remove redundant lines

* add basic ring attn; debug cross entropy

* fwd bwd logic complete

* fwd bwd logic complete; add experimental triton rescale

* precision tests passed

* precision tests passed

* fix typos and remove misc files

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add sp_mode to benchmark; fix varlen interface

* update softmax_lse shape by new interface

* change tester name

* remove buffer clone; support packed seq layout

* add varlen tests

* fix typo

* all tests passed

* add dkv_group; fix mask

* remove debug statements

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-08-16 13:56:38 +08:00
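The zigzag layout in the ring-attention feature above balances causal-attention work across sequence-parallel ranks by pairing an early sequence chunk with a late one on each rank. A minimal sketch of that sharding, assuming the sequence length is divisible by 2 * world_size (the function name zigzag_split is illustrative, not the actual interface):

```python
import torch

def zigzag_split(x: torch.Tensor, rank: int, world_size: int, seq_dim: int = 1) -> torch.Tensor:
    """Keep chunks `rank` and `2*world_size - 1 - rank` on this rank, so ranks that hold
    early (cheap, short causal mask) tokens also hold late (expensive) ones."""
    chunks = x.chunk(2 * world_size, dim=seq_dim)
    return torch.cat([chunks[rank], chunks[2 * world_size - 1 - rank]], dim=seq_dim)
```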
flybird11111
0a51319113
[fp8] zero support fp8 linear. (#6006)
* fix

* fix

* fix

* zero fp8

* zero fp8

* Update requirements.txt
2024-08-16 10:13:07 +08:00
Wang Binluo
3f09a6145f
[fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009) 2024-08-16 10:12:50 +08:00
flybird11111
20722a8c93
[fp8]update reduce-scatter test (#6002)
* fix

* fix

* fix

* fix
2024-08-15 14:40:54 +08:00
Haze188
887d2d579b
[misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991) 2024-08-15 14:40:26 +08:00
pre-commit-ci[bot]
4dd03999ec
[pre-commit.ci] pre-commit autoupdate (#5995)
updates:
- [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-08-15 14:40:03 +08:00
botbw
1a2e90dcc1 [fp8] linear perf enhancement 2024-08-15 13:43:08 +08:00
Hongxin Liu
406f984063
[plugin] add cast inputs option for zero (#6003) 2024-08-15 10:41:22 +08:00
botbw
88fa096d78
[fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004) 2024-08-15 10:14:42 +08:00
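A hedged sketch of the kind of version gate the commit above describes: wrap the FP8 linear kernel in torch.compile only when the installed torch is new enough. Here linear_fp8_impl is a placeholder for the real kernel, not ColossalAI's function name:

```python
import torch
from packaging import version

def linear_fp8_impl(x, weight, bias=None):
    # placeholder for the real FP8 matmul kernel
    return torch.nn.functional.linear(x, weight, bias)

if version.parse(torch.__version__) >= version.parse("2.4.0"):
    linear_fp8 = torch.compile(linear_fp8_impl)   # compile only on torch >= 2.4.0
else:
    linear_fp8 = linear_fp8_impl                  # fall back to eager on older torch
```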
flybird11111
597b206001
[fp8] support asynchronous FP8 communication (#5997)
* fix

* fix

* fix

* support async all2all

* support async op for all gather

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-08-14 14:08:19 +08:00
Tong Li
ceb1e262e7
fix sync condition (#6000) 2024-08-14 11:22:39 +08:00
Hongxin Liu
0978080a69
[fp8] refactor fp8 linear with compile (#5993)
* [fp8] refactor fp8 linear with compile

* [fp8] fix linear test

* [fp8] fix linear test
2024-08-13 16:07:26 +08:00
Wang Binluo
b2483c8e31
[fp8] support hybrid parallel plugin (#5982)
* support fp8 comm for qwen2 model

* support fp8 comm for qwen2 model

* support fp8 comm for qwen2 model

* fp8

* fix

* bert and bloom

* chatglm and command

* gpt2,gptj,bert, falcon,blip2

* mistral,opt,sam,t5,vit,whisper

* fix

* fix

* fix
2024-08-12 18:17:05 +08:00
YeAnbang
ed97d3a5d3
[Chat] fix readme (#5989)
* fix readme

* fix readme, tokenization fully tested

* fix readme, tokenization fully tested

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-08-12 14:55:17 +08:00
flybird11111
f1a3a326c4
[fp8]Moe support fp8 communication (#5977)
* fix

* support moe fp8

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

fix

fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-08-09 18:26:02 +08:00
Edenzzzz
b4d2377d4c
[Hotfix] Avoid fused RMSnorm import error without apex (#5985)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-08-09 18:17:09 +08:00
botbw
e4aadeee20
[fp8] use torch compile (torch >= 2.3.0) (#5979)
* [fp8] use torch compile (torch >= 2.4.0)

* [fp8] set use_fast_accum in linear

* [chore] formal version check

* [chore] fix sig
2024-08-09 15:51:06 +08:00
Hongxin Liu
8241c0c054
[fp8] support gemini plugin (#5978)
* [fp8] refactor hook

* [fp8] support gemini plugin

* [example] add fp8 option for llama benchmark
2024-08-09 14:09:48 +08:00
Tong Li
ad3fa4f49c
[Hotfix] README link (#5966)
* update ignore

* update readme

* run style

* update readme
2024-08-08 18:04:47 +08:00
flybird11111
4b9bec8176
[test ci]Feature/fp8 comm (#5981)
* fix

* fix

* fix
2024-08-08 17:19:21 +08:00
Hanks
b480eec738
[Feature]: support FP8 communication in DDP, FSDP, Gemini (#5928)
* support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* implement communication hook for FSDP params all-gather

* added unit test for fp8 operators

* support fp8 communication in GeminiPlugin

* update training scripts to support fsdp and fp8 communication

* fixed some minor bugs observed in unit test

* add all_gather_into_tensor_flat_fp8

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add skip the test if torch < 2.2.0

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add skip the test if torch < 2.2.0

* add skip the test if torch < 2.2.0

* add fp8_comm flag

* rebase latest fp8 operators

* rebase latest fp8 operators

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-08-08 15:55:01 +08:00
flybird11111
7739629b9d
fix (#5976) 2024-08-07 18:58:39 +08:00
Hongxin Liu
ccabcf6485
[fp8] support fp8 amp for hybrid parallel plugin (#5975)
* [fp8] support fp8 amp for hybrid parallel plugin

* [test] add fp8 hook test

* [fp8] fix fp8 linear compatibility
2024-08-07 18:21:08 +08:00
Hongxin Liu
76ea16466f
[fp8] add fp8 linear (#5967)
* [fp8] add fp8 linear

* [test] fix fp8 linear test condition

* [test] fix fp8 linear test condition

* [test] fix fp8 linear test condition
2024-08-07 15:41:49 +08:00
Edenzzzz
9179d4088e
[Docs] clarify launch port
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-08-07 13:53:48 +08:00
flybird11111
afb26de873
[fp8]support all2all fp8 (#5953)
* support all2all fp8

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-08-06 16:58:23 +08:00
flybird11111
0c10afd372
[FP8] rebase main (#5963)
* add SimPO

* fix dataloader

* remove debug code

* add orpo

* fix style

* fix colossalai, transformers version

* fix colossalai, transformers version

* fix colossalai, transformers version

* fix torch colossalai version

* update transformers version

* [shardformer] DeepseekMoE support (#5871)

* [Feature] deepseek moe expert parallel implement

* [misc] fix typo, remove redundant file (#5867)

* [misc] fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] deepseek support & unit test

* [misc] remove debug code & useless print

* [misc] fix typos (#5872)

* [Feature] remove modeling file, use auto config. (#5884)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [Deepseek] remove redundant code (#5888)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [misc] remove redundant code

* [Feature/deepseek] resolve comment. (#5889)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [misc] remove redundant code

* [misc] mv module replacement into if branch

* [misc] add some warning messages and modify some code in unit tests

* [misc] fix typos

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Hotfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)

* Diffusion Model Inference support

* Stable Diffusion 3 Support

* pixartalpha support

* [HotFix] CI,import,requirements-test for #5838 (#5892)

* [Hot Fix] CI,import,requirements-test

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] Enable PP + SP for llama (#5868)

* fix cross-PP-stage position id length diff bug

* fix typo

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* use one cross entropy func for all shardformer models

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897)

* add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint

* fix style

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix eval

* hotfix citation

* [zero] support all-gather overlap (#5898)

* [zero] support all-gather overlap

* [zero] add overlap all-gather flag

* [misc] fix typo

* [zero] update api

* fix orpo cross entropy loss

* [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)

* Remove unnecessary calls to deepcopy

* Build DimSpec's difference dict only once

This change considerably speeds up the construction of DimSpec objects. The difference_dict is the same for every DimSpec object, so a single copy of it is enough.

* Fix documentation of DimSpec's difference method

* [ShardFormer] fix qwen2 sp (#5903)

* [compatibility] support torch 2.2 (#5875)

* Support Pytorch 2.2.2

* keep build_on_pr file and update .compatibility

* fix object_to_tensor usage when torch>=2.3.0 (#5820)

* [misc] support torch2.3 (#5893)

* [misc] support torch2.3

* [devops] update compatibility ci

* [devops] update compatibility ci

* [devops] add debug

* [devops] add debug

* [devops] add debug

* [devops] add debug

* [devops] remove debug

* [devops] remove debug

* [release] update version (#5912)

* [plugin] support all-gather overlap for hybrid parallel (#5919)

* [plugin] fixed all-gather overlap support for hybrid parallel

* add kto

* fix style, add kto data sample

* [Examples] Add lazy init to OPT and GPT examples (#5924)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [ColossalChat] Hotfix for ColossalChat (#5910)

* add ignore and tiny llama

* fix path issue

* run style

* fix issue

* update bash

* add ignore and tiny llama

* fix path issue

* run style

* fix issue

* update bash

* fix ddp issue

* add Qwen 1.5 32B

* refactor tokenization

* [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)

* cannot access local variable 'default_conversation' where it is not associated with a value

set default value for 'default_conversation'

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix test data

* refactor evaluation

* remove real data path

* remove real data path

* Add n_fused as an input from native_module (#5894)

* [FIX BUG] convert env param to int in (#5934)

* [Hotfix] Fix ZeRO typo #5936

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)

* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* fix style

* fix style

* fix style

* [shardformer] hotfix attn mask (#5945)

* [shardformer] hotfix attn mask (#5947)

* [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)

* Distrifusion Support source

* comp comm overlap optimization

* sd3 benchmark

* pixart distrifusion bug fix

* sd3 bug fix and benchmark

* generation bug fix

* naming fix

* add docstring, fix counter and shape error

* add reference

* readme and requirement

* [zero] hotfix update master params (#5951)

* [release] update version (#5952)

* [Chat] Fix lora (#5946)

* fix merging

* remove filepath

* fix style

* Update README.md (#5958)

* [hotfix] Remove unused plan section (#5957)

* remove readme

* fix readme

* update

* [test] add mixtral for sequence classification

* [test] add mixtral transformer test

* [moe] fix plugin

* [test] mixtral pp shard test

* [chore] handle non member group

* [zero] solve hang

* [test] pass mixtral shardformer test

* [moe] implement transit between non moe tp and ep

* [zero] solve hang

* [misc] solve booster hang by renaming the variable

* solve hang when parallel mode = pp + dp

* [moe] implement submesh initialization

* [moe] add mixtral dp grad scaling when not all experts are activated

* [chore] manually revert unintended commit

* [chore] trivial fix

* [chore] arg pass & remove drop token

* [test] add mixtral modelling test

* [moe] implement tp

* [moe] test deepseek

* [moe] clean legacy code

* [Feature] MoE Ulysses Support (#5918)

* moe sp support

* moe sp bug solve

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [chore] minor fix

* [moe] init moe plugin comm setting with sp

* moe sp + ep bug fix

* [moe] finalize test (no pp)

* [moe] full test for deepseek and mixtral (pp + sp to fix)

* [chore] minor fix after rebase

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [chore] solve moe ckpt test failure and some other arg pass failure

* [moe] remove ops

* [test] fix test: test_zero1_2

* [bug] fix: somehow logger hangs the program

* [moe] deepseek moe sp support

* [test] add check

* [deepseek] replace attn (a workaround for bug in transformers)

* [misc] skip redundant test

* [misc] remove debug/print code

* [moe] refactor mesh assignment

* Revert "[moe] implement submesh initialization"

This reverts commit 2f9bce6686.

* [chore] change moe_pg_mesh to private

* [misc] remove incompatible test config

* [misc] fix ci failure: change default value to false in moe plugin

* [misc] remove useless condition

* [chore] docstring

* [moe] remove force_overlap_comm flag and add warning instead

* [doc] add MoeHybridParallelPlugin docstring

* [moe] solve dp axis issue

* [chore] remove redundant test case, print string & reduce test tokens

* [feat] Dist Loader for Eval (#5950)

* support auto distributed data loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* support auto distributed data loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix tp error

* remove unused parameters

* remove unused

* update inference

* update docs

* update inference

---------

Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [lora] lora support hybrid parallel plugin (#5956)

* lora support hybrid plugin

* fix

* fix

* fix

* fix

* fp8 operators for compressed communication

cast_to_fp8, cast_from_fp8, all_reduce_fp8

* fix scaling algorithm in FP8 casting

* support fp8 communication in pipeline parallelism

* add fp8_communication flag in the script

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* shardformer fp8

* fix rebase

* remove all to all

* fix shardformer fp8 communication training degradation

* [fp8] support all-gather flat tensor (#5932)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* Update low_level_optim.py

---------

Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Haze188 <haze188@qq.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Stephan Kö <stephankoe@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: zhurunhua <1281592874@qq.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: Gao, Ruiyuan <905370712@qq.com>
Co-authored-by: hxwang <wang1570@e.ntu.edu.sg>
Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: Wang Binluo <32676639+wangbluo@users.noreply.github.com>
Co-authored-by: HangXu <hangxu0304@gmail.com>
2024-08-06 16:29:37 +08:00
Guangyao Zhang
53cb9606bd
[Feature] llama shardformer fp8 support (#5938)
* add llama shardformer fp8

* Llama Shardformer Parity

* fix typo

* fix all reduce

* fix pytest failure

* fix reduce op and move function to fp8.py

* fix typo
2024-08-05 10:05:47 +08:00
Hanks
c297e21bea
Merge pull request #5961 from ver217/feature/zeor-fp8
[fp8] add fp8 comm for low level zero
2024-08-02 20:38:58 +08:00
YeAnbang
fe71917851
Merge pull request #5962 from hpcaitech/colossalchat
[Chat] Support overall loss, update KTO logging
2024-08-02 17:32:41 +08:00
YeAnbang
0b2d55c4ab Support overall loss, update KTO logging 2024-08-02 06:51:38 +00:00
ver217
91e596d017 [test] add zero fp8 test case 2024-08-02 11:28:38 +08:00
ver217
ae486ce005 [fp8] add fp8 comm for low level zero 2024-08-02 11:12:12 +08:00
Wang Binluo
75c963686f
[lora] lora support hybrid parallel plugin (#5956)
* lora support hybrid plugin

* fix

* fix

* fix

* fix
2024-08-02 10:36:58 +08:00
Tong Li
19d1510ea2
[feat] Dist Loader for Eval (#5950)
* support auto distributed data loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* support auto distributed data loader

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix tp error

* remove unused parameters

* remove unused

* update inference

* update docs

* update inference

---------

Co-authored-by: Michelle <qianranma8@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-08-02 10:06:25 +08:00
botbw
62cdac6b7b [chore] remove redundant test case, print string & reduce test tokens 2024-08-01 10:06:59 +08:00
botbw
d1d1ab871e [moe] solve dp axis issue 2024-08-01 10:06:59 +08:00
botbw
65daa87627 [doc] add MoeHybridParallelPlugin docstring 2024-08-01 10:06:59 +08:00
hxwang
7bedd03739 [moe] remove force_overlap_comm flag and add warning instead 2024-08-01 10:06:59 +08:00
hxwang
f7c5485ed6 [chore] docstring 2024-08-01 10:06:59 +08:00
haze188
7e737df5ad [misc] remove useless condition 2024-08-01 10:06:59 +08:00
haze188
70793ce9ed [misc] fix ci failure: change default value to false in moe plugin 2024-08-01 10:06:59 +08:00
haze188
12d043ca00 [misc] remove incompatible test config 2024-08-01 10:06:59 +08:00
hxwang
606b0891ed [chore] change moe_pg_mesh to private 2024-08-01 10:06:59 +08:00
hxwang
5b4c12381b Revert "[moe] implement submesh initialization"
This reverts commit 2f9bce6686.
2024-08-01 10:06:59 +08:00
hxwang
cb01c0d5ce [moe] refactor mesh assignment 2024-08-01 10:06:59 +08:00
haze188
034020bd04 [misc] remove debug/print code 2024-08-01 10:06:59 +08:00
haze188
59bcf56c60 [misc] skip redundant test 2024-08-01 10:06:59 +08:00
hxwang
c3dc9b4dba [deepseek] replace attn (a workaround for bug in transformers) 2024-08-01 10:06:59 +08:00
hxwang
6c39f0b144 [test] add check 2024-08-01 10:06:59 +08:00
haze188
b2952a5982 [moe] deepseek moe sp support 2024-08-01 10:06:59 +08:00
botbw
96d0fbc531 [bug] fix: somehow logger hangs the program 2024-08-01 10:06:59 +08:00
hxwang
067e18f7e9 [test] fix test: test_zero1_2 2024-08-01 10:06:59 +08:00
hxwang
74b03de3f9 [moe] remove ops 2024-08-01 10:06:59 +08:00
hxwang
70c9924d0d [chore] solve moe ckpt test failure and some other arg pass failure 2024-08-01 10:06:59 +08:00
pre-commit-ci[bot]
52d346f2a5 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-08-01 10:06:59 +08:00
hxwang
46037c2ccd [chore] minor fix after rebase 2024-08-01 10:06:59 +08:00
hxwang
803878b2fd [moe] full test for deepseek and mixtral (pp + sp to fix) 2024-08-01 10:06:59 +08:00
hxwang
7077d38d5a [moe] finalize test (no pp) 2024-08-01 10:06:59 +08:00
haze188
2cddeac717 moe sp + ep bug fix 2024-08-01 10:06:59 +08:00
hxwang
877d94bb8c [moe] init moe plugin comm setting with sp 2024-08-01 10:06:59 +08:00
hxwang
09d6280d3e [chore] minor fix 2024-08-01 10:06:59 +08:00
Haze188
404b16faf3 [Feature] MoE Ulysses Support (#5918)
* moe sp support

* moe sp bug solve

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-08-01 10:06:59 +08:00
hxwang
3e2b6132b7 [moe] clean legacy code 2024-08-01 10:06:59 +08:00
hxwang
74eccac0db [moe] test deepseek 2024-08-01 10:06:59 +08:00
botbw
dc583aa576 [moe] implement tp 2024-08-01 10:06:59 +08:00
botbw
0b5bbe9ce4 [test] add mixtral modelling test 2024-08-01 10:06:59 +08:00
hxwang
102b784a10 [chore] arg pass & remove drop token 2024-08-01 10:06:59 +08:00
botbw
8dbb86899d [chore] trivial fix 2024-08-01 10:06:59 +08:00
botbw
014faf6c5a [chore] manually revert unintended commit 2024-08-01 10:06:59 +08:00
botbw
9b9b76bdcd [moe] add mixtral dp grad scaling when not all experts are activated 2024-08-01 10:06:59 +08:00
botbw
e28e05345b [moe] implement submesh initialization 2024-08-01 10:06:59 +08:00
haze188
5ed5e8cfba solve hang when parallel mode = pp + dp 2024-08-01 10:06:59 +08:00
haze188
fe24789eb1 [misc] solve booster hang by renaming the variable 2024-08-01 10:06:59 +08:00
botbw
13b48ac0aa [zero] solve hang 2024-08-01 10:06:59 +08:00
botbw
b5bfeb2efd [moe] implement transit between non moe tp and ep 2024-08-01 10:06:59 +08:00
botbw
37443cc7e4 [test] pass mixtral shardformer test 2024-08-01 10:06:59 +08:00
hxwang
46c069b0db [zero] solve hang 2024-08-01 10:06:59 +08:00
hxwang
0fad23c691 [chore] handle non member group 2024-08-01 10:06:59 +08:00
hxwang
a249e71946 [test] mixtral pp shard test 2024-08-01 10:06:59 +08:00
hxwang
8ae8525bdf [moe] fix plugin 2024-08-01 10:06:59 +08:00
hxwang
0b76b57cd6 [test] add mixtral transformer test 2024-08-01 10:06:59 +08:00
hxwang
f9b6fcf81f [test] add mixtral for sequence classification 2024-08-01 10:06:59 +08:00
Tong Li
1aeb5e8847
[hotfix] Remove unused plan section (#5957)
* remove readme

* fix readme

* update
2024-07-31 17:47:46 +08:00
YeAnbang
66fbf2ecb7
Update README.md (#5958) 2024-07-31 17:44:09 +08:00
YeAnbang
30f4e31a33
[Chat] Fix lora (#5946)
* fix merging

* remove filepath

* fix style
2024-07-31 14:10:17 +08:00
Hongxin Liu
09c5f72595
[release] update version (#5952) 2024-07-31 10:04:50 +08:00
Hongxin Liu
060892162a
[zero] hotfix update master params (#5951) 2024-07-30 13:36:00 +08:00
Runyu Lu
bcf0181ecd
[Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895)
* Distrifusion Support source

* comp comm overlap optimization

* sd3 benchmark

* pixart distrifusion bug fix

* sd3 bug fix and benchmark

* generation bug fix

* naming fix

* add docstring, fix counter and shape error

* add reference

* readme and requirement
2024-07-30 10:43:26 +08:00
Hongxin Liu
7b38964e3a
[shardformer] hotfix attn mask (#5947) 2024-07-29 19:10:06 +08:00
Hongxin Liu
9664b1bc19
[shardformer] hotfix attn mask (#5945) 2024-07-29 13:58:27 +08:00
YeAnbang
c8332b9cb5
Merge pull request #5922 from hpcaitech/kto
[Chat] Add KTO
2024-07-29 13:27:00 +08:00
YeAnbang
6fd9e86864 fix style 2024-07-29 01:29:18 +00:00
YeAnbang
de1bf08ed0 fix style 2024-07-26 10:07:15 +00:00
YeAnbang
8a3ff4f315 fix style 2024-07-26 09:55:15 +00:00
zhurunhua
ad35a987d3
[Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941)
* Add a switch to control whether the model checkpoint needs to be saved after each epoch ends

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-07-26 11:15:20 +08:00
Edenzzzz
2069472e96
[Hotfix] Fix ZeRO typo #5936
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-07-25 09:59:58 +08:00
Hongxin Liu
5fd0592767
[fp8] support all-gather flat tensor (#5932) 2024-07-24 16:55:20 +08:00
Gao, Ruiyuan
5fb958cc83
[FIX BUG] convert env param to int in (#5934) 2024-07-24 10:30:40 +08:00
Insu Jang
a521ffc9f8
Add n_fused as an input from native_module (#5894) 2024-07-23 23:15:39 +08:00
YeAnbang
9688e19b32 remove real data path 2024-07-22 06:13:02 +00:00
YeAnbang
b0e15d563e remove real data path 2024-07-22 06:11:38 +00:00
YeAnbang
12fe8b5858 refactor evaluation 2024-07-22 05:57:39 +00:00
YeAnbang
c5f582f666 fix test data 2024-07-22 01:31:32 +00:00
zhurunhua
4ec17a7cdf
[FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931)
* cannot access local variable 'default_conversation' where it is not associated with a value

set default value for 'default_conversation'

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-07-21 19:46:01 +08:00
YeAnbang
150505cbb8 Merge branch 'kto' of https://github.com/hpcaitech/ColossalAI into kto 2024-07-19 10:11:05 +00:00
YeAnbang
d49550fb49 refactor tokenization 2024-07-19 10:10:48 +00:00
Tong Li
d08c99be0d
Merge branch 'main' into kto 2024-07-19 15:23:31 +08:00
Tong Li
f585d4e38e
[ColossalChat] Hotfix for ColossalChat (#5910)
* add ignore and tiny llama

* fix path issue

* run style

* fix issue

* update bash

* add ignore and tiny llama

* fix path issue

* run style

* fix issue

* update bash

* fix ddp issue

* add Qwen 1.5 32B
2024-07-19 13:40:07 +08:00
Edenzzzz
8cc8f645cd
[Examples] Add lazy init to OPT and GPT examples (#5924)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-07-19 10:10:08 +08:00
YeAnbang
544b7a38a1 fix style, add kto data sample 2024-07-18 08:38:56 +00:00
Guangyao Zhang
62661cde22
Merge pull request #5921 from BurkeHulk/fp8_fix
[Shardformer] Fix Shardformer FP8 communication training accuracy degradation
2024-07-18 16:34:38 +08:00
YeAnbang
845ea7214e Merge branch 'main' of https://github.com/hpcaitech/ColossalAI into kto 2024-07-18 07:55:43 +00:00
YeAnbang
09d5ffca1a add kto 2024-07-18 07:54:11 +00:00
Hongxin Liu
e86127925a
[plugin] support all-gather overlap for hybrid parallel (#5919)
* [plugin] fixed all-gather overlap support for hybrid parallel
2024-07-18 15:33:03 +08:00
GuangyaoZhang
5b969fd831 fix shardformer fp8 communication training degradation 2024-07-18 07:16:36 +00:00
Guangyao Zhang
d0bdb51f48
Merge pull request #5899 from BurkeHulk/SP_fp8
[Feature] FP8 communication in ShardFormer
2024-07-18 10:46:59 +08:00
Hongxin Liu
73494de577
[release] update version (#5912) 2024-07-17 17:29:59 +08:00
GuangyaoZhang
6a20f07b80 remove all to all 2024-07-17 07:14:55 +00:00
GuangyaoZhang
5a310b9ee1 fix rebase 2024-07-17 03:43:23 +00:00
GuangyaoZhang
457a0de79f shardformer fp8 2024-07-16 06:56:51 +00:00
Hongxin Liu
27a72f0de1 [misc] support torch2.3 (#5893)
* [misc] support torch2.3

* [devops] update compatibility ci

* [devops] update compatibility ci

* [devops] add debug

* [devops] add debug

* [devops] add debug

* [devops] add debug

* [devops] remove debug

* [devops] remove debug
2024-07-16 13:59:25 +08:00
アマデウス
530283dba0 fix object_to_tensor usage when torch>=2.3.0 (#5820) 2024-07-16 13:59:25 +08:00
Guangyao Zhang
2e28c793ce [compatibility] support torch 2.2 (#5875)
* Support Pytorch 2.2.2

* keep build_on_pr file and update .compatibility
2024-07-16 13:59:25 +08:00
Hanks
9470701110
Merge pull request #5885 from BurkeHulk/feature/fp8_comm
Feature/fp8 comm
2024-07-16 11:37:05 +08:00
YeAnbang
d8bf7e09a2
Merge pull request #5901 from hpcaitech/colossalchat
[Chat] fix eval: add in training evaluation, fix orpo sft loss bug
2024-07-16 11:07:32 +08:00
Guangyao Zhang
1c961b20f3
[ShardFormer] fix qwen2 sp (#5903) 2024-07-15 13:58:06 +08:00
Stephan Kö
45c49dde96
[Auto Parallel]: Speed up intra-op plan generation by 44% (#5446)
* Remove unnecessary calls to deepcopy

* Build DimSpec's difference dict only once

This change considerably speeds up the construction of DimSpec objects. The difference_dict is the same for every DimSpec object, so a single copy of it is enough.

* Fix documentation of DimSpec's difference method
2024-07-15 12:05:06 +08:00
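The speed-up above comes from building DimSpec's difference dict once and sharing it, since it is identical for every instance. A generic sketch of that caching pattern (the class body here is illustrative, not the real DimSpec):

```python
class DimSpec:
    _difference_dict = None  # shared across all instances; built lazily, exactly once

    def __init__(self, dims):
        self.dims = tuple(dims)
        if DimSpec._difference_dict is None:
            DimSpec._difference_dict = self._build_difference_dict()

    @staticmethod
    def _build_difference_dict():
        # stands in for the expensive table construction done once per process
        return {}

    def difference(self, other: "DimSpec") -> int:
        # look up the shared table instead of rebuilding it per instance or per call
        return self._difference_dict.get((self.dims, other.dims), 0)
```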
YeAnbang
b3594d4d68 fix orpo cross entropy loss 2024-07-15 02:12:05 +00:00
pre-commit-ci[bot]
51f916b11d [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-07-12 07:33:45 +00:00
BurkeHulk
1f1b856354 Merge remote-tracking branch 'origin/feature/fp8_comm' into feature/fp8_comm
# Conflicts:
#	colossalai/quantization/fp8.py
2024-07-12 15:29:41 +08:00
BurkeHulk
66018749f3 add fp8_communication flag in the script 2024-07-12 15:26:17 +08:00
BurkeHulk
e88190184a support fp8 communication in pipeline parallelism 2024-07-12 15:25:25 +08:00
BurkeHulk
1e1959467e fix scaling algorithm in FP8 casting 2024-07-12 15:23:37 +08:00
Hongxin Liu
c068ef0fa0
[zero] support all-gather overlap (#5898)
* [zero] support all-gather overlap

* [zero] add overlap all-gather flag

* [misc] fix typo

* [zero] update api
2024-07-11 18:59:59 +08:00
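The overlap flag above hides the ZeRO all-gather latency behind other work. A minimal sketch of the underlying pattern, assuming an initialized process group (names are illustrative):

```python
import torch
import torch.distributed as dist

def gather_params_overlapped(shard: torch.Tensor, do_other_work):
    world_size = dist.get_world_size()
    full = torch.empty(world_size * shard.numel(), dtype=shard.dtype, device=shard.device)
    # launch the collective asynchronously and keep the work handle
    handle = dist.all_gather_into_tensor(full, shard, async_op=True)
    other = do_other_work()   # e.g. compute on layers whose params are already gathered
    handle.wait()             # block only when the gathered parameters are actually needed
    return full, other
```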
YeAnbang
115c4cc5a4 hotfix citation 2024-07-11 06:05:05 +00:00
YeAnbang
e7a8634636 fix eval 2024-07-11 03:35:03 +00:00
YeAnbang
dd9e1cdafe
Merge pull request #5850 from hpcaitech/rlhf_SimPO
[Chat] Rlhf support SimPO
2024-07-11 09:14:12 +08:00
pre-commit-ci[bot]
8a9721bafe [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-07-10 10:44:32 +00:00
YeAnbang
33f15203d3 Merge branch 'main' of https://github.com/hpcaitech/ColossalAI into rlhf_SimPO 2024-07-10 10:39:34 +00:00
YeAnbang
f6ef5c3609 fix style 2024-07-10 10:37:17 +00:00
YeAnbang
d888c3787c add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. Support lora with gradient checkpoint 2024-07-10 10:17:08 +00:00
GuangyaoZhang
dbfa7d39fc fix typo 2024-07-10 08:13:26 +00:00
Guangyao Zhang
669849d74b
[ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897) 2024-07-10 11:34:25 +08:00
YeAnbang
16f3451fe2 Merge branch 'main' of https://github.com/hpcaitech/ColossalAI into rlhf_SimPO 2024-07-10 02:32:07 +00:00
Edenzzzz
fbf33ecd01
[Feature] Enable PP + SP for llama (#5868)
* fix cross-PP-stage position id length diff bug

* fix typo

* fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* use one cross entropy func for all shardformer models

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-07-09 18:05:20 +08:00
Runyu Lu
66abf1c6e8
[HotFix] CI,import,requirements-test for #5838 (#5892)
* [Hot Fix] CI,import,requirements-test

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-07-08 22:32:06 +08:00
Runyu Lu
cba20525a8
[Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838)
* Diffusion Model Inference support

* Stable Diffusion 3 Support

* pixartalpha support
2024-07-08 16:02:07 +08:00
Edenzzzz
8ec24b6a4d
[Hotfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-07-05 20:02:36 +08:00
Haze188
3420921101
[shardformer] DeepseekMoE support (#5871)
* [Feature] deepseek moe expert parallel implement

* [misc] fix typo, remove redundant file (#5867)

* [misc] fix typo

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] deepseek support & unit test

* [misc] remove debug code & useless print

* [misc] fix typos (#5872)

* [Feature] remove modeling file, use auto config. (#5884)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [Deepseek] remove redundant code (#5888)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [misc] remove redundant code

* [Feature/deepseek] resolve comment. (#5889)

* [misc] fix typos

* [Feature] deepseek support via auto model, remove modeling file

* [misc] delete useless file

* [misc] fix typos

* [misc] remove redundant code

* [misc] mv module replacement into if branch

* [misc] add some warning messages and modify some code in unit tests

* [misc] fix typos

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-07-05 16:13:58 +08:00
pre-commit-ci[bot]
e17f835df7 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-07-04 12:47:17 +00:00
Hanks
6991819a97
Merge branch 'hpcaitech:main' into feature/fp8_comm 2024-07-04 20:34:41 +08:00
pre-commit-ci[bot]
7997683aac
[pre-commit.ci] pre-commit autoupdate (#5878)
updates:
- [github.com/pre-commit/mirrors-clang-format: v18.1.7 → v18.1.8](https://github.com/pre-commit/mirrors-clang-format/compare/v18.1.7...v18.1.8)

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-07-04 13:46:41 +08:00
Hongxin Liu
7afbc81d62
[quant] fix bitsandbytes version check (#5882)
* [quant] fix bitsandbytes version check

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-07-04 11:33:23 +08:00
Wang Binluo
6cd4c32be4
[shardformer] fix the moe (#5883) 2024-07-03 20:02:19 +08:00
Edenzzzz
eb24fcd914
[Hotfix] Fix OPT gradient checkpointing forward
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-07-03 14:57:57 +08:00
Haze188
ea94c07b95
[hotfix] fix the bug that large tensor exceed the maximum capacity of TensorBucket (#5879) 2024-07-02 12:42:02 +08:00
pre-commit-ci[bot]
7c2f79fa98
[pre-commit.ci] pre-commit autoupdate (#5572)
* [pre-commit.ci] pre-commit autoupdate

updates:
- [github.com/PyCQA/autoflake: v2.2.1 → v2.3.1](https://github.com/PyCQA/autoflake/compare/v2.2.1...v2.3.1)
- [github.com/pycqa/isort: 5.12.0 → 5.13.2](https://github.com/pycqa/isort/compare/5.12.0...5.13.2)
- [github.com/psf/black-pre-commit-mirror: 23.9.1 → 24.4.2](https://github.com/psf/black-pre-commit-mirror/compare/23.9.1...24.4.2)
- [github.com/pre-commit/mirrors-clang-format: v13.0.1 → v18.1.7](https://github.com/pre-commit/mirrors-clang-format/compare/v13.0.1...v18.1.7)
- [github.com/pre-commit/pre-commit-hooks: v4.3.0 → v4.6.0](https://github.com/pre-commit/pre-commit-hooks/compare/v4.3.0...v4.6.0)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-07-01 17:16:41 +08:00
Edenzzzz
936d0b0f7b
[doc] Update llama + sp compatibility; fix dist optim table
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-07-01 17:07:22 +08:00
Jianghai
8ab46b4000
[Shardformer] change qwen2 modeling into gradient checkpointing style (#5874) 2024-07-01 16:45:09 +08:00
HangXu
f5a52e1600
fp8 operators for compressed communication
cast_to_fp8, cast_from_fp8, all_reduce_fp8
2024-07-01 13:44:21 +08:00
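The commit above introduces the cast_to_fp8, cast_from_fp8, and all_reduce_fp8 operators for compressed communication. A minimal sketch of what per-tensor scaled FP8 casting and an FP8 all-reduce built on top of it can look like, assuming torch >= 2.1 for the float8 dtypes; the scaling scheme, dtype choice, and the gather-then-reduce layout are assumptions, not the ColossalAI implementation:

```python
import torch
import torch.distributed as dist

def cast_to_fp8(x: torch.Tensor, fp8_dtype=torch.float8_e4m3fn):
    """Per-tensor scaled cast: returns the fp8 payload and the inverse scale."""
    fp8_max = torch.finfo(fp8_dtype).max
    amax = x.abs().max().clamp(min=1e-12)
    scale = fp8_max / amax
    x_fp8 = (x * scale).clamp(-fp8_max, fp8_max).to(fp8_dtype)
    return x_fp8, scale.reciprocal().reshape(1)

def cast_from_fp8(x_fp8: torch.Tensor, inv_scale: torch.Tensor, dtype=torch.bfloat16):
    """Undo the scaled cast."""
    return x_fp8.to(dtype) * inv_scale

def all_reduce_fp8(x: torch.Tensor, group=None) -> torch.Tensor:
    """Reduce with fp8 on the wire: gather fp8 payloads, then reduce locally in x.dtype.
    (A production version would shard the tensor and reduce-scatter instead of gathering everything.)"""
    world_size = dist.get_world_size(group)
    x_fp8, inv_scale = cast_to_fp8(x.contiguous())
    payload = x_fp8.view(torch.uint8)                       # NCCL cannot reduce float8 directly
    payloads = [torch.empty_like(payload) for _ in range(world_size)]
    scales = [torch.empty_like(inv_scale) for _ in range(world_size)]
    dist.all_gather(payloads, payload, group=group)
    dist.all_gather(scales, inv_scale, group=group)
    out = torch.zeros_like(x)
    for p, s in zip(payloads, scales):
        out += cast_from_fp8(p.view(torch.float8_e4m3fn), s, dtype=x.dtype)
    return out
```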
YeAnbang
ff535204fe update transformers version 2024-06-28 06:24:30 +00:00
Haze188
416580b314
[MoE/ZeRO] Moe refactor with zero refactor (#5821)
* [moe] removed openmoe-coupled code and rectify mixstral code (#5471)

* [Feature] MoE refactor; Integration with Mixtral (#5682)

* cherry pick from refractor-moe branch

* tests passed

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* support ep + zero

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* add mixtral auto policy & move pipeline forward code to modeling folder

* [moe refactor] modify kernel test without Route Class

* [moe refactor] add moe tensor test path environment variable to github workflow

* fix typos

* fix moe test bug due to the code rebase

* [moe refactor] fix moe zero test, and little bug in low level zero

* fix typo

* add moe tensor path to github workflow

* remove some useless code

* fix typo & unify global variable XX_AXIS logic without using -1

* fix typo & prettify the code

* remove print code & support zero 2 test

* remove useless code

* rename function

* fix typo

* fix typo

* Further improve the test code

* remove print code

* [moe refactor] change test model from fake moe model to mixtral moe layer and remove useless test

* [moe refactor] skip some unit test which will be refactored later

* [moe refactor] fix unit import error

* [moe refactor] fix circular import issues

* [moe refactor] remove debug code

* [moe refactor] update github workflow

* [moe/zero] refactor low level optimizer (#5767)

* [zero] refactor low level optimizer

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] MoE refactor with newest version of ZeRO (#5801)

* [zero] remove redundant members in BucketStore (#5802)

* [zero] align api with previous version

* [Moe/Zero] Update MoeHybridParallelPlugin with refactored ZeRO and Fix Zero bug (#5819)

* [moe refactor] update unit test with the refactored ZeRO and remove useless test

* move moe checkpoint to checkpoint folder and exchange global axis to class member

* update moe hybrid parallel plugin with newest version of zero & fix zero working/master params bug

* fix zero unit test

* Add an assertion to prevent users from using it incorrectly

* [hotfix]Solve the compatibility issue of zero refactor (#5823)

* [moe refactor] update unit test with the refactored ZeRO and remove useless test

* move moe checkpoint to checkpoint folder and exchange global axis to class member

* update moe hybrid parallel plugin with newest version of zero & fix zero working/master params bug

* fix zero unit test

* Add an assertion to prevent users from using it incorrectly

* Modify function parameter names to resolve compatibility issues

* [zero] fix missing hook removal (#5824)

* [MoE] Resolve .github conflict (#5829)

* [Fix/Example] Fix Llama Inference Loading Data Type (#5763)

* [fix/example] fix llama inference loading dtype

* revise loading dtype of benchmark llama3

* [release] update version (#5752)

* [release] update version

* [devops] update compatibility test

* [devops] update compatibility test

* [devops] update compatibility test

* [devops] update compatibility test

* [test] fix ddp plugin test

* [test] fix gptj and rpc test

* [devops] fix cuda ext compatibility

* [inference] fix flash decoding test

* [inference] fix flash decoding test

* fix (#5765)

* [test] Fix/fix testcase (#5770)

* [fix] branch for fix testcase;

* [fix] fix test_analyzer & test_auto_parallel;

* [fix] remove local change about moe;

* [fix] rm local change moe;

* [Hotfix] Add missing init file in inference.executor (#5774)

* [CI/tests] simplify some test case to reduce testing time (#5755)

* [ci/tests] simplify some test case to reduce testing time

* [ci/tests] continue to remove test case to reduce ci time cost

* restore some test config

* [ci/tests] continue to reduce ci time cost

* [misc] update dockerfile (#5776)

* [misc] update dockerfile

* [misc] update dockerfile

* [devops] fix docker ci (#5780)

* [Inference]Add Streaming LLM (#5745)

* Add Streaming LLM

* add some parameters to llama_generation.py

* verify streamingllm config

* add test_streamingllm.py

* modified according to the opinions of review

* add Citation

* change _block_tables tolist

* [hotfix] fix llama flash attention forward (#5777)

* [misc] Accelerate CI for zero and dist optim (#5758)

* remove fp16 from lamb

* remove d2h copy in checking states

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [Test/CI] remove test cases to reduce CI duration (#5753)

* [test] smaller gpt2 test case

* [test] reduce test cases: tests/test_zero/test_gemini/test_zeroddp_state_dict.py

* [test] reduce test cases: tests/test_zero/test_gemini/test_grad_accum.py

* [test] reduce test cases tests/test_zero/test_gemini/test_optim.py

* Revert "[test] smaller gpt2 test case"

Some tests might depend on the size of model (num of chunks)

This reverts commit df705a5210.

* [test] reduce test cases: tests/test_checkpoint_io/test_gemini_checkpoint_io.py

* [CI] smaller test model for the two modified cases

* [CI] hardcode gpt model for tests/test_zero/test_gemini/test_search.py since we need a fixed answer there

* [hotfix] fix testcase in test_fx/test_tracer (#5779)

* [fix] branch for fix testcase;

* [fix] fix test_analyzer & test_auto_parallel;

* [fix] remove local change about moe;

* [fix] rm local change moe;

* [fix] fix test_deepfm_model & test_dlrf_model;

* [fix] fix test_hf_albert & test_hf_gpt;

* [gemini] optimize reduce scatter d2h copy (#5760)

* [gemini] optimize reduce scatter d2h copy

* [fix] fix missing reduce variable

* [refactor] remove legacy async reduce scatter code

* [gemini] missing sync

* Revert "[refactor] remove legacy async reduce scatter code"

This reverts commit 58ad76d466.

* [gemini] further optimize with async all reduce

* [fix] pass flag from manager to chunk

* Allow building cuda extension without a device. (#5535)

Added FORCE_CUDA environment variable support, to enable building extensions where a GPU device is not present but cuda libraries are.

* [misc] fix dist logger (#5782)

* [install]fix setup (#5786)

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [misc] update requirements (#5787)

* [shardformer] fix import (#5788)

* upgrade colossal-chat support tp_group>1, add sp for sft

* upgrade ppo dpo rm script

* run pre-commit

* update ci tests, sft ci test cases passed, tp failed in generation for ppo, sp is buggy

* fix training script

* fix ci

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix transformers version

* remove duplicated test

* fix datasets version

* remove models that require huggingface auth from ci

* remove local data path

* update ci

* remove baichuan from template test due to transformer version conflict

* merge

* Refactor modeling by adding attention backend

Signed-off-by: char-1ee <xingjianli59@gmail.com>

* Fix tests and naming

Signed-off-by: char-1ee <xingjianli59@gmail.com>

* Pass inference model shard configs for module init

Signed-off-by: char-1ee <xingjianli59@gmail.com>

* Clean up

Signed-off-by: char-1ee <xingjianli59@gmail.com>

* replace the customized dataloader setup with the built-in one

* replace the customized dataloader setup with the built-in one

* Remove flash attention backend

Signed-off-by: char-1ee <xingjianli59@gmail.com>

* fix readme

* Fix test import

Signed-off-by: char-1ee <xingjianli59@gmail.com>

* update sft training script

* [Inference]refactor baichuan (#5791)

* refactor baichuan

* remove unused code and add TODO for lazyinit

* [test] fix chatglm test kit (#5793)

* [shardformer] fix modeling of bloom and falcon (#5796)

* [test] fix qwen2 pytest distLarge (#5797)

* [Inference] Fix flash-attn import and add model test (#5794)

* Fix torch int32 dtype

Signed-off-by: char-1ee <xingjianli59@gmail.com>

* Fix flash-attn import

Signed-off-by: char-1ee <xingjianli59@gmail.com>

* Add generalized model test

Signed-off-by: char-1ee <xingjianli59@gmail.com>

* Remove exposed path to model

Signed-off-by: char-1ee <xingjianli59@gmail.com>

* Add default value for use_flash_attn

Signed-off-by: char-1ee <xingjianli59@gmail.com>

* Rename model test

Signed-off-by: char-1ee <xingjianli59@gmail.com>

---------

Signed-off-by: char-1ee <xingjianli59@gmail.com>

* [Gemini] Use async stream to prefetch and h2d data moving (#5781)

* use async stream to prefetch and h2d data moving

* Remove redundant code

* [gemini] quick fix on possible async operation (#5803)

* [gemini] quick fix on possible async operation

* [gemini] quick fix on possible async operation

* [shardformer] upgrade transformers to 4.39.3 (#5815)

* [shardformer]upgrade transformers for gpt2/gptj/whisper (#5807)

* [shardformer] fix modeling of gpt2 and gptj

* [shardformer] fix whisper modeling

* [misc] update requirements

---------

Co-authored-by: ver217 <lhx0217@gmail.com>

* [shardformer]upgrade transformers for mistral (#5808)

* upgrade transformers for mistral

* fix

* fix

* [shardformer]upgrade transformers for llama (#5809)

* update transformers

fix

* fix

* fix

* [inference] upgrade transformers (#5810)

* update transformers

fix

* fix

* fix

* fix

* fix

* [gemini] update transformers for gemini (#5814)

---------

Co-authored-by: ver217 <lhx0217@gmail.com>

* Support 4d parallel + flash attention (#5789)

* support tp + sp + pp

* remove comments

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

---------

Signed-off-by: char-1ee <xingjianli59@gmail.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: flybird11111 <1829166702@qq.com>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: botbw <wang1570@e.ntu.edu.sg>
Co-authored-by: Charles Coulombe <ccoulombe@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: char-1ee <xingjianli59@gmail.com>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>

* [zero] fix hook bug

* [zero] add low level optimizer back (#5839)

* [zero] fix param & refactor

* [zero] add back original low level opt

* [zero] remove moe related

* [zero] pass zero tests

* [zero] refactor

* [chore] add del func back

* [zero] comments and naming (#5840)

* [zero] modify api (#5843)

* [zero] modify api

* [test] remove _grad_store access in tests

* [test] fix (#5857)

* [CI] skip openmoe CI check

* [CI] fix pre-commit

* [zero] remove redundant member init (#5862)

* [misc] remove useless code, modify the pg mesh implementation

* [misc] remove useless code, modify the pg mesh implementation

* [misc] use tempfile

* resolve conflict with main branch

* [misc] use tempfile in test_moe_checkpoint.py

* [misc] remove useless code, add assertion about sequence parallel, move logger into function

* [misc] remove useless code

---------

Signed-off-by: char-1ee <xingjianli59@gmail.com>
Co-authored-by: Frank Lee <somerlee.9@gmail.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: botbw <wang1570@e.ntu.edu.sg>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: flybird11111 <1829166702@qq.com>
Co-authored-by: duanjunwen <935724073@qq.com>
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Charles Coulombe <ccoulombe@users.noreply.github.com>
Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: char-1ee <xingjianli59@gmail.com>
Co-authored-by: Runyu Lu <77330637+LRY89757@users.noreply.github.com>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Guangyao Zhang <xjtu521@qq.com>
2024-06-28 14:00:08 +08:00
YeAnbang
a8af6ccb73 fix torch colossalai version 2024-06-28 03:58:29 +00:00
flybird11111
773d9f964a
[shardformer]delete xformers (#5859)
* delete xformers

* fix

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-06-28 11:20:04 +08:00
YeAnbang
e7527762a1 Merge branch 'main' of https://github.com/hpcaitech/ColossalAI into rlhf_SimPO 2024-06-28 02:50:14 +00:00
Hongxin Liu
eaea88cf9e
[release] update version (#5864) 2024-06-28 10:49:55 +08:00
Runyu Lu
3c7cda0c9a
[Inference]Lazy Init Support (#5785)
* lazy init support

* lazy init llama support

* lazy init support for baichuan

* align rpc

* add note for baichuan

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-06-27 18:02:15 +08:00
Guangyao Zhang
d9d5e7ea1f
[shardformer] Support the T5ForTokenClassification model (#5816)
* t5 token, pytest still fails

* Resolve T5 Pytest Failure

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix typos

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-06-27 16:40:38 +08:00
Hongxin Liu
5dfbcd7746
[zero] use bucket during allgather (#5860)
* [zero] use bucket during allgather

* [zero] rename api
2024-06-27 16:34:44 +08:00
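The bucketing above packs many small parameter shards into one flat buffer so a single all-gather replaces many tiny collectives. A minimal sketch of that idea, assuming all shards share a dtype and device (names and the flat layout are assumptions):

```python
import torch
import torch.distributed as dist

def bucketed_all_gather(shards):
    """Gather a list of per-rank shards with one collective instead of len(shards) calls."""
    world_size = dist.get_world_size()
    sizes = [s.numel() for s in shards]
    bucket = torch.cat([s.reshape(-1) for s in shards])          # flat bucket for this rank
    out = torch.empty(world_size * bucket.numel(), dtype=bucket.dtype, device=bucket.device)
    dist.all_gather_into_tensor(out, bucket)
    # unpack: one tuple of per-parameter slices per rank
    return [chunk.split(sizes) for chunk in out.chunk(world_size)]
```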
YeAnbang
b117274074 fix colossalai, transformers version 2024-06-27 08:30:17 +00:00
YeAnbang
afa53066ca fix colossalai, transformers version 2024-06-27 08:28:36 +00:00
YeAnbang
384c64057d fix colossalai, transformers version 2024-06-27 08:26:44 +00:00
YeAnbang
8aad064fe7 fix style 2024-06-27 07:29:33 +00:00
YeAnbang
c8d1b4a968 add orpo 2024-06-27 07:20:28 +00:00
botbw
8e718a1421
[gemini] fixes for benchmarking (#5847)
* [gemini] fix missing return

* [gemini] fix missing arg pass

* [gemini] use gather tensor instead of list

* [test] enable flash attention for benchmark by default

* [test] enable flash attention for benchmark by default

---------

Co-authored-by: genghaozhe <939857490@qq.com>
2024-06-26 15:52:09 +08:00
Edenzzzz
2a25a2aff7
[Feature] optimize PP overlap (#5735)
* update to fully overlap, still debugging

* improve interface

* fixed deadlock bug

* debug NaN loss

* (experimental) use one comm group for send_fw_recv_fw to fix NaN

* cleaned up interfaces; use one batch p2p for all

* clean up; removed the double p2p batch case

* p2p test passed

* improve overlap: send fwd before backward

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* tentatively use 2 p2p batches

* remove two p2p batches

* fix typos

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* remove pp.sh

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: root <root@notebook-c55824c0-7742-45e8-9591-c855bb77ad29-0.notebook-c55824c0-7742-45e8-9591-c855bb77ad29.colossal-ai.svc.cluster.local>
2024-06-26 14:48:02 +08:00
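The pipeline-parallel overlap above relies on batching point-to-point sends and receives so they are issued together instead of serialized. A minimal sketch using torch.distributed's batched p2p API (the tensor names and the specific send/recv pairing are illustrative):

```python
import torch
import torch.distributed as dist

def exchange_activations(send_fwd: torch.Tensor, recv_buf: torch.Tensor,
                         next_rank: int, prev_rank: int) -> None:
    """Issue the forward send and the matching receive as one batched p2p operation."""
    ops = [
        dist.P2POp(dist.isend, send_fwd, next_rank),
        dist.P2POp(dist.irecv, recv_buf, prev_rank),
    ]
    for req in dist.batch_isend_irecv(ops):
        req.wait()
```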
binmakeswell
4ccaaaab63
[doc] add GPU cloud playground (#5851)
* [doc] add GPU cloud playground

* [doc] add GPU cloud playground

* [doc] add GPU cloud playground

* [doc] add GPU cloud playground

* [doc] add GPU cloud playground

* [doc] add GPU cloud playground

* [doc] add GPU cloud playground

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-06-25 11:03:16 +08:00
YeAnbang
f3de5a025c remove debug code 2024-06-24 05:16:29 +00:00
YeAnbang
0b2d6275c4 fix dataloader 2024-06-24 05:10:44 +00:00
YeAnbang
4b59d874df Merge branch 'main' of https://github.com/hpcaitech/ColossalAI into main 2024-06-24 02:16:03 +00:00
YeAnbang
82aecd6374 add SimPO 2024-06-24 02:12:20 +00:00
binmakeswell
7266f82d03
[doc] fix open sora model weight link (#5848)
* [doc] fix open sora model weight link

* [doc] fix open sora model weight link
2024-06-21 22:48:34 +08:00
binmakeswell
8f445729a4
[doc] opensora v1.2 news (#5846)
* [doc] opensora v1.2 news

* [doc] opensora v1.2 news
2024-06-21 14:20:45 +08:00
botbw
8a5c86439a
[gemini] fix missing return (#5845) 2024-06-21 11:38:40 +08:00
Hongxin Liu
bd3e34fef6
[release] update version (#5833) 2024-06-20 13:33:24 +08:00
Yuanheng Zhao
7b249c76e5
[Fix] Fix spec-dec Glide LlamaModel for compatibility with transformers (#5837)
* fix glide llama model

* revise
2024-06-19 15:37:53 +08:00
Guangyao Zhang
fd1dc417d8
[shardformer] Change atol in test command-r weight-check to pass pytest (#5835) 2024-06-19 13:59:22 +08:00
Guangyao Zhang
2014cce870
[devops] Remove building on PR when edited to avoid skip issue (#5836) 2024-06-19 13:58:05 +08:00
Kai Lv
0adca5b688
[launch] Support IPv4 host initialization in launch (#5822) 2024-06-18 19:18:29 +08:00
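A minimal sketch of initializing the default process group from an explicit IPv4 host, which is what the launch change above enables; the helper name and backend choice are assumptions:

```python
import torch.distributed as dist

def launch_from_host(host: str, port: int, rank: int, world_size: int) -> None:
    """Rendezvous over TCP at an explicit IPv4 address instead of env:// variables."""
    dist.init_process_group(
        backend="nccl",
        init_method=f"tcp://{host}:{port}",   # e.g. tcp://192.168.0.10:29500
        rank=rank,
        world_size=world_size,
    )
```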
Guangyao Zhang
639394b0d4
Merge pull request #5818 from GuangyaoZhang/command-r
[shardformer] Support the Command-R model
2024-06-18 19:01:21 +08:00
Edenzzzz
7f9ec599be
[misc] Add dist optim to doc sidebar (#5806)
* add to sidebar

* fix chinese
2024-06-18 13:52:47 +08:00
GuangyaoZhang
4adbc36913 Merge branch 'command-r' of github.com:GuangyaoZhang/ColossalAI into command-r 2024-06-18 03:33:02 +00:00
GuangyaoZhang
d84d68601a change 'xxx if xxx else None' to 'xxx or None' 2024-06-18 03:32:42 +00:00
pre-commit-ci[bot]
996c65077e [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-06-18 03:32:30 +00:00
GuangyaoZhang
a83a2336e8 rebase master llama change 2024-06-18 02:56:47 +00:00
GuangyaoZhang
20c0b06ff5 Merge branch 'command-r' of github.com:GuangyaoZhang/ColossalAI into command-r 2024-06-18 02:37:14 +00:00
GuangyaoZhang
363cde6957 merge model and attention forward 2024-06-18 02:32:41 +00:00
GuangyaoZhang
7a2b08646f Remove CohereLayerNorm and use existing layernorm 2024-06-18 02:32:41 +00:00
GuangyaoZhang
fe2e74c03a fix precommit 2024-06-18 02:31:33 +00:00
GuangyaoZhang
98da648a4a Fix Code Factor check 2024-06-18 02:31:33 +00:00
GuangyaoZhang
f656d61778 change command 2024-06-18 02:31:33 +00:00
GuangyaoZhang
0b81163bc0 Copy llama to command 2024-06-18 02:31:33 +00:00
Edenzzzz
8795bb2e80
Support 4d parallel + flash attention (#5789)
* support tp + sp + pp

* remove comments

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-06-17 17:40:47 +08:00
GuangyaoZhang
3c7302ad0e merge model and attention forward 2024-06-17 08:50:05 +00:00
GuangyaoZhang
8c3f524660 Remove CohereLayerNorm and use existing layernorm 2024-06-14 09:14:01 +00:00
GuangyaoZhang
c9025ebd7c Merge branch 'command-r' of github.com:GuangyaoZhang/ColossalAI into command-r 2024-06-14 08:10:31 +00:00
GuangyaoZhang
9a290ab013 fix precommit 2024-06-14 08:09:24 +00:00
pre-commit-ci[bot]
2a7fa2e7d0 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-06-14 08:05:07 +00:00
GuangyaoZhang
1016bb3257 Fix Code Factor check 2024-06-14 08:04:29 +00:00
GuangyaoZhang
94fbde6055 change command 2024-06-14 07:55:13 +00:00
GuangyaoZhang
431b7bcf8f Copy llama to command 2024-06-14 03:07:01 +00:00
flybird11111
2ddf624a86
[shardformer] upgrade transformers to 4.39.3 (#5815)
* [shardformer]upgrade transformers for gpt2/gptj/whisper (#5807)

* [shardformer] fix modeling of gpt2 and gptj

* [shardformer] fix whisper modeling

* [misc] update requirements

---------

Co-authored-by: ver217 <lhx0217@gmail.com>

* [shardformer]upgrade transformers for mistral (#5808)

* upgrade transformers for mistral

* fix

* fix

* [shardformer]upgrade transformers for llama (#5809)

* update transformers

fix

* fix

* fix

* [inference] upgrade transformers (#5810)

* update transformers

fix

* fix

* fix

* fix

* fix

* [gemini] update transformers for gemini (#5814)

---------

Co-authored-by: ver217 <lhx0217@gmail.com>
2024-06-14 10:59:33 +08:00
botbw
3bcbba9262
[gemini] quick fix on possible async operation (#5803)
* [gemini] quick fix on possible async operation

* [gemini] quick fix on possible async operation
2024-06-13 10:35:17 +08:00
Haze188
d9dddf574f
[Gemini] Use async stream to prefetch and h2d data moving (#5781)
* use async stream to prefetch and h2d data moving

* Remove redundant code
2024-06-12 15:48:52 +08:00
Li Xingjian
8554585a5f
[Inference] Fix flash-attn import and add model test (#5794)
* Fix torch int32 dtype

Signed-off-by: char-1ee <xingjianli59@gmail.com>

* Fix flash-attn import

Signed-off-by: char-1ee <xingjianli59@gmail.com>

* Add generalized model test

Signed-off-by: char-1ee <xingjianli59@gmail.com>

* Remove exposed path to model

Signed-off-by: char-1ee <xingjianli59@gmail.com>

* Add default value for use_flash_attn

Signed-off-by: char-1ee <xingjianli59@gmail.com>

* Rename model test

Signed-off-by: char-1ee <xingjianli59@gmail.com>

---------

Signed-off-by: char-1ee <xingjianli59@gmail.com>
2024-06-12 14:13:50 +08:00
Guangyao Zhang
aac941ef78
[test] fix qwen2 pytest distLarge (#5797) 2024-06-12 12:13:51 +08:00
Hongxin Liu
aa125bcc91
[shardformer] fix modeling of bloom and falcon (#5796) 2024-06-11 17:43:50 +08:00
Hongxin Liu
587bbf4c6d
[test] fix chatglm test kit (#5793) 2024-06-11 16:54:31 +08:00
YeAnbang
74f4a29734
Merge pull request #5759 from hpcaitech/colossalchat_upgrade
[ColossalChat] Colossalchat upgrade
2024-06-11 12:49:53 +08:00
Runyu Lu
c0948aff97
[Inference]refactor baichuan (#5791)
* refactor baichuan

* remove unused code and add TODO for lazyinit
2024-06-11 10:52:01 +08:00
YeAnbang
84eab13078 update sft training script 2024-06-11 02:44:20 +00:00
Li Xingjian
77a219a082
Merge pull request #5771 from char-1ee/refactor/modeling
[Inference] Refactor modeling attention layer by abstracting attention backends
2024-06-10 11:52:22 +08:00
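The merged PR above refactors the modeling code by abstracting attention backends. A minimal sketch of what such an abstraction can look like (class and method names are hypothetical, not the actual ColossalAI API):

```python
from abc import ABC, abstractmethod


class AttentionBackend(ABC):
    """Hypothetical interface: model code calls attend() and never branches
    on which attention kernel is installed."""

    @abstractmethod
    def attend(self, q, k, v):
        ...


class NaiveBackend(AttentionBackend):
    def attend(self, q, k, v):
        return "plain softmax(QK^T)V path"   # placeholder


class FlashBackend(AttentionBackend):
    def attend(self, q, k, v):
        return "flash-attn kernel path"      # placeholder


def get_backend(use_flash_attn: bool) -> AttentionBackend:
    # single place that decides which backend the model uses
    return FlashBackend() if use_flash_attn else NaiveBackend()
```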
char-1ee
b303976a27 Fix test import
Signed-off-by: char-1ee <xingjianli59@gmail.com>
2024-06-10 02:03:30 +00:00
YeAnbang
2abdede1d7 fix readme 2024-06-10 01:08:42 +00:00
char-1ee
f5981e808e Remove flash attention backend
Signed-off-by: char-1ee <xingjianli59@gmail.com>
2024-06-07 10:02:19 +00:00
YeAnbang
77db21610a replace the customized dataloader setup with the built-in one 2024-06-07 09:44:25 +00:00
YeAnbang
0d7ff10ea5 replace the customized dataloader setup with the built-in one 2024-06-07 09:43:42 +00:00
char-1ee
ceba662d22 Clean up
Signed-off-by: char-1ee <xingjianli59@gmail.com>
2024-06-07 09:09:29 +00:00
char-1ee
5f398fc000 Pass inference model shard configs for module init
Signed-off-by: char-1ee <xingjianli59@gmail.com>
2024-06-07 08:33:52 +00:00
char-1ee
eec77e5702 Fix tests and naming
Signed-off-by: char-1ee <xingjianli59@gmail.com>
2024-06-07 08:33:47 +00:00
char-1ee
04386d9eff Refactor modeling by adding attention backend
Signed-off-by: char-1ee <xingjianli59@gmail.com>
2024-06-07 08:33:47 +00:00
YeAnbang
790e1362a6 merge 2024-06-07 07:01:32 +00:00
YeAnbang
ac1520cb8f remove baichuan from template test due to transformer version conflict 2024-06-07 07:01:32 +00:00
YeAnbang
e16ccc272a update ci 2024-06-07 07:01:32 +00:00
YeAnbang
45195ac53d remove local data path 2024-06-07 07:01:31 +00:00
YeAnbang
bf57b13dda remove models that require huggingface auth from ci 2024-06-07 07:01:31 +00:00
YeAnbang
0bbac158ed fix datasets version 2024-06-07 07:01:31 +00:00
YeAnbang
62eb28b929 remove duplicated test 2024-06-07 07:01:31 +00:00
YeAnbang
b8b5cacf38 fix transformers version 2024-06-07 07:01:31 +00:00
pre-commit-ci[bot]
1b880ce095 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-06-07 07:01:31 +00:00
YeAnbang
b1031f7244 fix ci 2024-06-07 07:01:31 +00:00
YeAnbang
7ae87b3159 fix training script 2024-06-07 07:01:31 +00:00
YeAnbang
0b4a33548c update ci tests; sft ci test cases passed, tp failed in generation for ppo, sp is buggy 2024-06-07 07:01:31 +00:00
YeAnbang
7e65b71815 run pre-commit 2024-06-07 07:01:30 +00:00
YeAnbang
929e1e3da4 upgrade ppo dpo rm script 2024-06-07 07:01:30 +00:00
YeAnbang
7a7e86987d upgrade colossal-chat to support tp_group>1, add sp for sft 2024-06-07 07:01:30 +00:00
Hongxin Liu
73e88a5553
[shardformer] fix import (#5788) 2024-06-06 19:09:50 +08:00
Hongxin Liu
5ead00ffc5
[misc] update requirements (#5787) 2024-06-06 15:55:34 +08:00
flybird11111
a1e39f4c0d
[install]fix setup (#5786)
* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-06-06 11:47:48 +08:00
Hongxin Liu
b9d646fe9e
[misc] fix dist logger (#5782) 2024-06-05 15:04:22 +08:00
Charles Coulombe
c46e09715c
Allow building cuda extension without a device. (#5535)
Added FORCE_CUDA environment variable support to enable building extensions when no GPU device is present but the CUDA libraries are available.
2024-06-05 14:26:30 +08:00
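The entry above adds FORCE_CUDA support. A sketch of the usual pattern in a setup script (not the exact ColossalAI setup.py):

```python
import os

import torch


def should_build_cuda_ext() -> bool:
    # Build CUDA extensions either when a GPU is visible or when the user
    # explicitly sets FORCE_CUDA=1 (e.g. on a CPU-only build node that has
    # the CUDA toolkit installed).
    force_cuda = os.environ.get("FORCE_CUDA", "0") == "1"
    return torch.cuda.is_available() or force_cuda
```

Typical usage on such a build node would be something like `FORCE_CUDA=1 pip install .`.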
botbw
3f7e3131d9
[gemini] optimize reduce scatter d2h copy (#5760)
* [gemini] optimize reduce scatter d2h copy

* [fix] fix missing reduce variable

* [refactor] remove legacy async reduce scatter code

* [gemini] missing sync

* Revert "[refactor] remove legacy async reduce scatter code"

This reverts commit 58ad76d466.

* [gemini] further optimize with async all reduce

* [fix] pass flag from manager to chunk
2024-06-05 14:23:13 +08:00
duanjunwen
10a19e22c6
[hotfix] fix testcase in test_fx/test_tracer (#5779)
* [fix] branch for fix testcase;

* [fix] fix test_analyzer & test_auto_parallel;

* [fix] remove local change about moe;

* [fix] rm local change moe;

* [fix] fix test_deepfm_model & test_dlrf_model;

* [fix] fix test_hf_albert & test_hf_gpt;
2024-06-05 11:29:32 +08:00
botbw
80c3c8789b
[Test/CI] remove test cases to reduce CI duration (#5753)
* [test] smaller gpt2 test case

* [test] reduce test cases: tests/test_zero/test_gemini/test_zeroddp_state_dict.py

* [test] reduce test cases: tests/test_zero/test_gemini/test_grad_accum.py

* [test] reduce test cases tests/test_zero/test_gemini/test_optim.py

* Revert "[test] smaller gpt2 test case"

Some tests might depend on the size of model (num of chunks)

This reverts commit df705a5210.

* [test] reduce test cases: tests/test_checkpoint_io/test_gemini_checkpoint_io.py

* [CI] smaller test model for the two modified cases

* [CI] hardcode gpt model for tests/test_zero/test_gemini/test_search.py since we need a fixed answer there
2024-06-05 11:29:04 +08:00
Edenzzzz
79f7a7b211
[misc] Accelerate CI for zero and dist optim (#5758)
* remove fp16 from lamb

* remove d2h copy in checking states

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-06-05 11:25:19 +08:00
flybird11111
50b4c8e8cf
[hotfix] fix llama flash attention forward (#5777) 2024-06-05 10:56:47 +08:00
yuehuayingxueluo
b45000f839
[Inference]Add Streaming LLM (#5745)
* Add Streaming LLM

* add some parameters to llama_generation.py

* verify streamingllm config

* add test_streamingllm.py

* modified according to review comments

* add Citation

* change _block_tables tolist
2024-06-05 10:51:19 +08:00
Hongxin Liu
ee6fd38373
[devops] fix docker ci (#5780) 2024-06-04 17:47:39 +08:00
Hongxin Liu
32f4187806
[misc] update dockerfile (#5776)
* [misc] update dockerfile

* [misc] update dockerfile
2024-06-04 16:15:41 +08:00
Haze188
e22b82755d
[CI/tests] simplify some test case to reduce testing time (#5755)
* [ci/tests] simplify some test case to reduce testing time

* [ci/tests] continue to remove test case to reduce ci time cost

* restore some test config

* [ci/tests] continue to reduce ci time cost
2024-06-04 13:57:54 +08:00
Yuanheng Zhao
406443200f
[Hotfix] Add missing init file in inference.executor (#5774) 2024-06-03 22:29:39 +08:00
duanjunwen
1b76564e16
[test] Fix/fix testcase (#5770)
* [fix] branch for fix testcase;

* [fix] fix test_analyzer & test_auto_parallel;

* [fix] remove local change about moe;

* [fix] rm local change moe;
2024-06-03 15:26:01 +08:00
flybird11111
3f2be80530
fix (#5765) 2024-06-03 11:25:18 +08:00
Hongxin Liu
68359ed1e1
[release] update version (#5752)
* [release] update version

* [devops] update compatibility test

* [devops] update compatibility test

* [devops] update compatibility test

* [devops] update compatibility test

* [test] fix ddp plugin test

* [test] fix gptj and rpc test

* [devops] fix cuda ext compatibility

* [inference] fix flash decoding test

* [inference] fix flash decoding test
2024-05-31 19:40:26 +08:00
Yuanheng Zhao
677cbfacf8
[Fix/Example] Fix Llama Inference Loading Data Type (#5763)
* [fix/example] fix llama inference loading dtype

* revise loading dtype of benchmark llama3
2024-05-30 13:48:46 +08:00
botbw
023ea13cb5
Merge pull request #5749 from hpcaitech/prefetch
[Gemini] Prefetch next chunk before each op
2024-05-29 15:35:54 +08:00
hxwang
154720ba6e [chore] refactor profiler utils 2024-05-28 12:41:42 +00:00
hxwang
8547562884 [chore] remove unnecessary assert since compute list might not be recorded 2024-05-28 05:16:02 +00:00
hxwang
e5e3320948 [bug] continue fix 2024-05-28 02:41:23 +00:00
hxwang
936dd96dbb [bug] workaround for idx fix 2024-05-28 02:33:12 +00:00
botbw
e0dde8fda5
Merge pull request #5754 from Hz188/prefetch
[Gemini]Prefetch benchmark
2024-05-27 14:59:21 +08:00
botbw
157b4cc357
Merge branch 'prefetch' into prefetch 2024-05-27 14:58:57 +08:00
genghaozhe
87665d7922 correct argument help message 2024-05-27 06:03:53 +00:00
Haze188
4d097def96
[Gemini] add some code for reduce-scatter overlap, chunk prefetch in llama benchmark. (#5751)
* [bugs] fix args.profile=False DummyProfiler error

* add args.prefetch_num for benchmark
2024-05-25 23:00:13 +08:00
genghaozhe
b9269d962d add args.prefetch_num for benchmark 2024-05-25 14:55:50 +00:00
genghaozhe
fba04e857b [bugs] fix args.profile=False DummyProfiler error 2024-05-25 14:55:09 +00:00
Yuanheng Zhao
b96c6390f4
[inference] Fix running time of test_continuous_batching (#5750) 2024-05-24 19:34:15 +08:00
Edenzzzz
5f8c0a0ac3
[Feature] auto-cast optimizers to distributed version (#5746)
* auto-cast optimizers to distributed

* fix galore casting

* logger

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-05-24 17:24:16 +08:00
hxwang
ca674549e0 [chore] remove unnecessary test & changes 2024-05-24 06:09:36 +00:00
hxwang
ff507b755e Merge branch 'main' of github.com:hpcaitech/ColossalAI into prefetch 2024-05-24 04:05:07 +00:00
hxwang
63c057cd8e [example] add profile util for llama 2024-05-24 03:59:36 +00:00
botbw
2fc85abf43
[gemini] async grad chunk reduce (all-reduce&reduce-scatter) (#5713)
* [gemini] async grad chunk reduce (all-reduce&reduce-scatter)

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [gemini] add test

* [gemini] rename func

* [gemini] update llama benchmark

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [gemini] use tensor counter

* [gemini] change default config in GeminiPlugin and GeminiDDP

* [chore] typo

* [gemini] fix sync issue & add test cases

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-05-24 10:31:16 +08:00
Jianghai
85946d4236
[Inference]Fix readme and example for API server (#5742)
* fix chatapi readme and example

* updating doc

* add an api and change the doc

* remove

* add credits and del 'API' heading

* readme

* readme
2024-05-24 10:03:05 +08:00
hxwang
15d21a077a Merge remote-tracking branch 'origin/main' into prefetch 2024-05-23 15:49:33 +00:00
binmakeswell
4647ec28c8
[inference] release (#5747)
* [inference] release

* [inference] release

* [inference] release

* [inference] release

* [inference] release

* [inference] release

* [inference] release
2024-05-23 17:44:06 +08:00
Yuanheng Zhao
df6747603f
[Colossal-Inference] (v0.1.0) Merge pull request #5739 from hpcaitech/feature/colossal-infer
[Inference] Merge feature/colossal-infer
2024-05-22 14:31:09 +08:00
Yuanheng Zhao
498f42c45b
[NFC] fix requirements (#5744) 2024-05-22 12:08:49 +08:00
Yuanheng Zhao
bd38fe6b91
[NFC] Fix code factors on inference triton kernels (#5743) 2024-05-21 22:12:15 +08:00
Yuanheng Zhao
c2c8c9cf17
[ci] Temporary fix for build on pr (#5741)
* temporary fix for CI

* timeout to 90
2024-05-21 18:20:57 +08:00
botbw
13c06d36a3
[bug] fix early return (#5740)
* [bug] fix silly bug

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [chore] add test for prefetch

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-05-21 14:21:58 +08:00
Yuanheng Zhao
c06208e72c
Merge pull request #5737 from yuanheng-zhao/inference/sync/main
[sync] Sync feature/colossal-infer with main
2024-05-21 11:26:37 +08:00
Haze188
22ce873c3f
[Shardformer] Add parallel output for shardformer models(bloom, falcon) (#5702)
* [pre-commit.ci] auto fixes from pre-commit.com hooks

* add parallel cross entropy output for falcon model & fix some typos in bloom.py

* fix module name error, self.model -> self.transformers in bloom, falcon model

* Fix the overflow bug of distributed cross entropy loss function when training with fp16

* add dtype to parallel cross entropy loss function

* fix dtype-related typos and prettify loss.py

* fix grad dtype and update dtype mismatch error

* fix typo bugs
2024-05-21 11:07:13 +08:00
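The bullets above mention fixing an fp16 overflow in the distributed cross-entropy loss by adding a dtype argument. A simplified sketch of the dtype-upcast part only (it ignores the tensor-parallel reduction and is not the actual ColossalAI loss code):

```python
import torch
import torch.nn.functional as F


def stable_cross_entropy(logits: torch.Tensor,
                         labels: torch.Tensor,
                         dtype: torch.dtype = torch.float32) -> torch.Tensor:
    # Upcast before the exp/log-sum-exp so fp16 activations do not overflow;
    # the loss itself is returned in float32.
    return F.cross_entropy(logits.to(dtype), labels)
```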
Haze188
83716e9feb
Merge pull request #5738 from botbw/prefetch
[chore] fix init error
2024-05-21 10:40:56 +08:00
pre-commit-ci[bot]
b3c0e6d871 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-05-21 02:09:15 +00:00
hxwang
137a7c341b [chore] fix init error 2024-05-21 02:07:21 +00:00
Yuanheng Zhao
8633c15da9 [sync] Sync feature/colossal-infer with main 2024-05-20 15:50:53 +00:00
Yuanheng Zhao
d8b1ea4ac9
[doc] Update Inference Readme (#5736)
* [doc] update inference readme

* add contents

* trivial
2024-05-20 22:50:04 +08:00
Yuanheng Zhao
bdf9a001d6
[Fix/Inference] Add unsupported auto-policy error message (#5730)
* [fix] auto policy error message

* trivial
2024-05-20 22:49:18 +08:00
botbw
f5b7de38a4
Merge pull request #5733 from Hz188/feature/prefetch
[Gemini] implement auto policy prefetch and slightly modify the original code.
2024-05-20 15:31:34 +08:00
genghaozhe
90d8d0183c remove personal comments 2024-05-20 07:28:20 +00:00
genghaozhe
bfcb2d1ff8 refactor the code structure to solve the circular import 2024-05-20 07:25:24 +00:00
genghaozhe
a280517dd9 remove unrelated file 2024-05-20 05:25:35 +00:00
genghaozhe
3b363d44cc Merge branch 'feature/prefetch' of https://github.com/Hz188/ColossalAI into feature/prefetch 2024-05-20 05:23:40 +00:00
genghaozhe
1ec92d29af remove perf log, unrelated file and so on 2024-05-20 05:23:26 +00:00
genghaozhe
5c6c5d6be3 remove comments 2024-05-20 05:23:12 +00:00
genghaozhe
df63db7e63 remove comments 2024-05-20 05:15:51 +00:00
genghaozhe
7416e4943b fix conflicts to beautify the code 2024-05-20 04:09:51 +00:00
botbw
f5a5287f87
Merge pull request #5731 from botbw/prefetch
[gemini] prefetch for auto policy
2024-05-20 12:04:33 +08:00
genghaozhe
d22bf30ca6 implement auto policy prefetch and slightly modify the original code. 2024-05-20 04:01:53 +00:00
pre-commit-ci[bot]
f1918e18a5 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-05-20 03:00:07 +00:00
hxwang
a55a9e298b [gemini] init auto policy prefetch 2024-05-20 02:21:17 +00:00
Yuanheng Zhao
283c407a19
[Inference] Fix Inference Generation Config and Sampling (#5710)
* refactor and add

* config default values

* fix gen config passing

* fix rpc generation config
2024-05-19 15:08:42 +08:00
Haze188
c5ddf17c76
Merge branch 'hpcaitech:feature/prefetch' into feature/prefetch 2024-05-17 18:58:53 +08:00
genghaozhe
06a3a100b3 remove unrelated code 2024-05-17 10:57:49 +00:00
genghaozhe
3d625ca836 add some TODO messages 2024-05-17 10:55:28 +00:00
flybird11111
9d83c6d715
[lazy] fix lazy cls init (#5720)
* fix

* fix

* fix

* fix

* fix

* remove kernel install

* rebase

revert

fix

* fix

* fix
2024-05-17 18:18:59 +08:00
botbw
9690981601
Merge pull request #5722 from botbw/prefetch
[gemini] prefetch chunks
2024-05-17 13:46:18 +08:00
botbw
e57812c672
[chore] Update placement_policy.py 2024-05-17 13:42:18 +08:00
Yuanheng Zhao
8bcfe360fd
[example] Update Inference Example (#5725)
* [example] update inference example
2024-05-17 11:28:53 +08:00
genghaozhe
013690a86b remove set(all_chunks) 2024-05-16 09:57:51 +00:00
hxwang
6efbadba25 [chore] remove debugging info 2024-05-16 16:46:39 +08:00
hxwang
20701d4533 [chore] remove print 2024-05-16 16:45:50 +08:00
hxwang
f45f8a2aa7 [gemini] maxprefetch means maximum work to keep 2024-05-16 16:12:53 +08:00
genghaozhe
fc2248cf99 Merge branch 'prefetch' of github.com:botbw/ColossalAI into feature/prefetch 2024-05-16 08:05:32 +00:00
genghaozhe
5470e5f94e a commit for fake push test 2024-05-16 08:03:40 +00:00
pre-commit-ci[bot]
6bbe956316 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-05-16 07:26:20 +00:00
hxwang
82b25524ff Merge branch 'prefetch' of github.com:botbw/ColossalAI into prefetch 2024-05-16 07:25:22 +00:00
genghaozhe
1f6b57099c Merge branch 'prefetch' of github.com:botbw/ColossalAI into botbw-prefetch 2024-05-16 07:23:40 +00:00
hxwang
2e68eebdfe [chore] refactor & sync 2024-05-16 07:22:10 +00:00
binmakeswell
2011b1356a
[misc] Update PyTorch version in docs (#5724)
* [misc] Update PyTorch version in docs

* [misc] Update PyTorch version in docs
2024-05-16 13:54:32 +08:00
pre-commit-ci[bot]
5bedea6e10 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-05-16 05:20:01 +00:00
hxwang
4148ceed9f [gemini] use compute_chunk to find next chunk 2024-05-16 13:17:26 +08:00
hxwang
b2e9745888 [chore] sync 2024-05-16 04:45:06 +00:00
傅剑寒
a8d459f99a
[Inference] Delete duplicated package (#5723) 2024-05-16 10:49:03 +08:00
hxwang
6e38eafebe [gemini] prefetch chunks 2024-05-15 16:51:44 +08:00
Jianghai
f47f2fbb24
[Inference] Fix API server, test and example (#5712)
* fix api server

* fix generation config

* fix api server

* fix comments

* fix infer hanging bug

* resolve comments, change backend to free port
2024-05-15 15:47:31 +08:00
Tong Li
913c920ecc
[Colossal-LLaMA] Fix sft issue for llama2 (#5719)
* fix minor issue

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-05-15 10:52:11 +08:00
Runyu Lu
74c47921fa
[Fix] Llama3 Load/Omit CheckpointIO Temporarily (#5717)
* Fix Llama3 Load error
* Omit Checkpoint IO Temporarily
2024-05-14 20:17:43 +08:00
Yuanheng Zhao
5bbab1533a
[ci] Fix example tests (#5714)
* [fix] revise timeout value on example CI

* trivial
2024-05-14 16:08:51 +08:00
傅剑寒
121d7ad629
[Inference] Delete duplicated copy_vector (#5716) 2024-05-14 14:35:33 +08:00
Edenzzzz
43995ee436
[Feature] Distributed optimizers: Lamb, Galore, CAME and Adafactor (#5694)
* [feat] Add distributed lamb; minor fixes in DeviceMesh (#5476)

* init: add dist lamb; add debiasing for lamb

* dist lamb tester mostly done

* all tests passed

* add comments

* all tests passed. Removed debugging statements

* moved setup_distributed inside plugin. Added dist layout caching

* organize better

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [hotfix] Improve tester precision by removing ZeRO on vanilla lamb (#5576)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [optim] add distributed came (#5526)

* test CAME under LowLevelZeroOptimizer wrapper

* test CAME TP row and col pass

* test CAME zero pass

* came zero add master and worker param id convert

* came zero test pass

* came zero test pass

* test distributed came passed

* refactor code, modify some expressions, and add comments

* minor fix of test came

* minor fix of dist_came and test

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* minor fix of dist_came and test

* rebase dist-optim

* rebase dist-optim

* fix remaining comments

* add test dist came using booster api

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [optim] Distributed Adafactor (#5484)

* [feature] solve conflict; update optimizer readme;

* [feature] update optimize readme;

* [fix] fix testcase;

* [feature] Add transformer-bert to testcase; solve a bug related to indivisible shape (induced when use_zero is on and tp is row-parallel);

* [feature] Add transformers_bert model zoo in testcase;

* [feature] add user documentation to docs/source/feature.

* [feature] add API Reference & Sample to optimizer Readme; add state check for bert exam;

* [feature] modify user documentation;

* [fix] fix readme format issue;

* [fix] add zero=0 in testcase; cache argument in dict;

* [fix] fix precision issue;

* [feature] add distributed rms;

* [feature] remove useless comment in testcase;

* [fix] Remove useless test; open zero test; remove fp16 test in bert exam;

* [feature] Extract distributed rms function;

* [feature] add booster + lowlevelzeroPlugin in test;

* [feature] add Start_with_booster_API case in md; add Supporting Information in md;

* [fix] Also remove state movement in base adafactor;

* [feature] extract factor function;

* [feature] add LowLevelZeroPlugin test;

* [fix] add tp=False and zero=True in logic;

* [fix] fix use zero logic;

* [feature] add row residue logic in column parallel factor;

* [feature] add check optim state func;

* [feature] Remove duplicate logic;

* [feature] update optim state check func and fix precision test bug;

* [fix] update/fix optim state; precision issue still exists;

* [fix] Add use_zero check in _rms; Add plugin support info in Readme; Add Dist Adafactor init Info;

* [feature] removed print & comments in utils;

* [feature] update Readme;

* [feature] add LowLevelZeroPlugin test with Bert model zoo;

* [fix] fix logic in _rms;

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [fix] remove comments in testcase;

* [feature] add zh-Han Readme;

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] refactor dist came; fix precision error; add low level zero test with bert model zoo; (#5676)

* [feature] daily update;

* [fix] fix dist came;

* [feature] refactor dist came; fix precision error; add low level zero test with bert model zoo;

* [fix] open rms; fix low level zero test; fix dist came test function name;

* [fix] remove redundant test;

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] Add Galore (Adam, Adafactor) and distributed GaloreAdamW8bit (#5570)

* init: add dist lamb; add debiasing for lamb

* dist lamb tester mostly done

* all tests passed

* add comments

* all tests passed. Removed debugging statements

* moved setup_distributed inside plugin. Added dist layout caching

* organize better

* update comments

* add initial distributed galore

* add initial distributed galore

* add galore set param utils; change setup_distributed interface

* projected grad precision passed

* basic precision tests passed

* tests passed; located svd precision issue in fwd-bwd; banned these tests

* Plugin DP + TP tests passed

* move get_shard_dim to d_tensor

* add comments

* remove useless files

* remove useless files

* fix zero typo

* improve interface

* remove moe changes

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix import

* fix deepcopy

* update came & adafactor to main

* fix param map

* fix typo

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Hotfix] Remove one buggy test case from dist_adafactor for now (#5692)


Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: chongqichuizi875 <107315010+chongqichuizi875@users.noreply.github.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: duanjunwen <54985467+duanjunwen@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
2024-05-14 13:52:45 +08:00
Steve Luo
7806842f2d
add paged-attention v2: support seq length split across thread block (#5707) 2024-05-14 12:46:54 +08:00
Runyu Lu
18d67d0e8e
[Feat]Inference RPC Server Support (#5705)
* rpc support source
* kv cache logical/physical disaggregation
* sampler refactor
* colossalai launch built in
* Unit test
* Rpyc support

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-05-14 10:00:55 +08:00
hugo-syn
393c8f5b7f
[hotfix] fix inference typo (#5438) 2024-05-13 21:06:44 +08:00
Edenzzzz
785cd9a9c9
[misc] Update PyTorch version in docs (#5711)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-05-13 12:02:52 +08:00
yuehuayingxueluo
de4bf3dedf
[Inference]Adapt repetition_penalty and no_repeat_ngram_size (#5708)
* Adapt repetition_penalty and no_repeat_ngram_size

* fix no_repeat_ngram_size_logit_process

* remove batch_updated

* fix annotation

* modified code based on the review feedback.

* rm get_batch_token_ids
2024-05-11 15:13:25 +08:00
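The entry above adapts repetition_penalty. A sketch of the standard repetition-penalty logit processing (in the style of common open-source implementations, not necessarily the exact ColossalAI logit processor):

```python
import torch


def apply_repetition_penalty(logits: torch.Tensor,
                             generated_ids: torch.Tensor,
                             penalty: float) -> torch.Tensor:
    # logits: [batch, vocab]; generated_ids: [batch, num_generated_tokens]
    scores = torch.gather(logits, dim=-1, index=generated_ids)
    # Shrink positive logits and amplify negative ones for repeated tokens.
    scores = torch.where(scores > 0, scores / penalty, scores * penalty)
    return logits.scatter(dim=-1, index=generated_ids, src=scores)
```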
傅剑寒
50104ab340
[Inference/Feat] Add convert_fp8 op for fp8 test in the future (#5706)
* add convert_fp8 op for fp8 test in the future

* rerun ci
2024-05-10 18:39:54 +08:00
Wang Binluo
537f6a3855
[Shardformer]fix the num_heads assert for llama model and qwen model (#5704)
* fix the num_heads assert

* fix the transformers import

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix the import

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-05-10 15:33:39 +08:00
Wang Binluo
a3cc68ca93
[Shardformer] Support the Qwen2 model (#5699)
* feat: support qwen2 model

* fix: modify model config and add Qwen2RMSNorm

* fix qwen2 model conflicts

* test: add qwen2 shard test

* to: add qwen2 auto policy

* support qwen model

* fix the conflicts

* add try catch

* add transformers version for qwen2

* add the ColoAttention for the qwen2 model

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* add the unit test version check

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix the test input bug

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix the version check

* fix the version check

---------

Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-05-09 20:04:25 +08:00
傅剑寒
bfad39357b
[Inference/Feat] Add quant kvcache interface (#5700)
* add quant kvcache interface

* delete unused output

* complete args comments
2024-05-09 18:03:24 +08:00
Jianghai
492520dbdb
Merge pull request #5588 from hpcaitech/feat/online-serving
[Feature]Online Serving
2024-05-09 17:19:45 +08:00
CjhHa1
5d9a49483d [Inference] Add example test_ci script 2024-05-09 05:44:05 +00:00
flybird11111
d4c5ef441e
[gemini]remove registered gradients hooks (#5696)
* fix gemini

fix gemini

* fix

fix
2024-05-09 10:29:49 +08:00
CjhHa1
bc9063adf1 resolve rebase conflicts on Branch feat/online-serving 2024-05-08 15:20:53 +00:00
Jianghai
61a1b2e798 [Inference] Fix bugs and docs for feat/online-server (#5598)
* fix test bugs

* add do sample test

* del useless lines

* fix comments

* fix tests

* delete version tag

* delete version tag

* add

* del test sever

* fix test

* fix

* Revert "add"

This reverts commit b9305fb024.
2024-05-08 15:20:53 +00:00
CjhHa1
7bbb28e48b [Inference] resolve rebase conflicts
fix
2024-05-08 15:20:53 +00:00
Jianghai
c064032865 [Online Server] Chat Api for streaming and not streaming response (#5470)
* fix bugs

* fix bugs

* fix api server

* fix api server

* add chat api and test

* del request.n
2024-05-08 15:20:53 +00:00
Jianghai
de378cd2ab [Inference] Finish Online Serving Test, add streaming output api, continuous batching test and example (#5432)
* finish online test and add examples

* fix test_continuous_batching

* fix some bugs

* fix bash

* fix

* fix inference

* finish revision

* fix typos

* revision
2024-05-08 15:20:52 +00:00
Jianghai
69cd7e069d [Inference] ADD async and sync Api server using FastAPI (#5396)
* add api server

* fix

* add

* add completion service and fix bug

* add generation config

* revise shardformer

* fix bugs

* add docstrings and fix some bugs

* fix bugs and add choices for prompt template
2024-05-08 15:18:28 +00:00
yuehuayingxueluo
d482922035
[Inference] Support the logic related to ignoring EOS token (#5693)
* Adapt temperature processing logic

* add ValueError for top_p and top_k

* add GQA Test

* fix except_msg

* support ignore EOS token

* change variable's name

* fix annotation
2024-05-08 19:59:10 +08:00
yuehuayingxueluo
9c2fe7935f
[Inference]Adapt temperature processing logic (#5689)
* Adapt temperature processing logic

* add ValueError for top_p and top_k

* add GQA Test

* fix except_msg
2024-05-08 17:58:29 +08:00
Yuanheng Zhao
12e7c28d5e
[hotfix] fix OpenMOE example import path (#5697) 2024-05-08 15:48:47 +08:00
Wang Binluo
22297789ab
Merge pull request #5684 from wangbluo/parallel_output
[Shardformer] Add Parallel output for shardformer models
2024-05-07 22:59:42 -05:00
Yuanheng Zhao
55cc7f3df7
[Fix] Fix Inference Example, Tests, and Requirements (#5688)
* clean requirements

* modify example inference struct

* add test ci scripts

* mark test_infer as submodule

* rm deprecated cls & deps

* import of HAS_FLASH_ATTN

* prune inference tests to be run

* prune triton kernel tests

* increment pytest timeout mins

* revert import path in openmoe
2024-05-08 11:30:15 +08:00
Yuanheng Zhao
f9afe0addd
[hotfix] Fix KV Heads Number Assignment in KVCacheManager (#5695)
- Fix KV head number assignment in KVCacheManager, as well as the method of accessing it
2024-05-07 23:13:14 +08:00
wangbluo
4e50cce26b fix the mistral model 2024-05-07 09:17:56 +00:00
wangbluo
a8408b4d31 remove comment code 2024-05-07 07:08:56 +00:00
pre-commit-ci[bot]
ca56b93d83 [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-05-07 07:07:09 +00:00
wangbluo
108ddfb795 add parallel_output for the opt model 2024-05-07 07:05:53 +00:00
pre-commit-ci[bot]
88f057ce7c [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-05-07 07:03:47 +00:00
Edenzzzz
58954b2986
[misc] Add an existing issue checkbox in bug report (#5691)
Co-authored-by: Wenxuan(Eden) Tan <wtan45@wisc.edu>
2024-05-07 12:18:50 +08:00
flybird11111
77ec773388
[zero]remove registered gradients hooks (#5687)
* remove registered hooks

fix

fix

fix zero

fix

fix

fix

fix

fix zero

fix zero

fix

fix

fix

* fix

fix

fix
2024-05-07 12:01:38 +08:00
Edenzzzz
c25f83c85f
fix missing pad token (#5690)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-05-06 18:17:26 +08:00
傅剑寒
1ace1065e6
[Inference/Feat] Add quant kvcache support for decode_kv_cache_memcpy (#5686) 2024-05-06 15:35:13 +08:00
Yuanheng Zhao
db7b3051f4
[Sync] Update from main to feature/colossal-infer (Merge pull request #5685)
[Sync] Update from main to feature/colossal-infer

- Merge pull request #5685 from yuanheng-zhao/inference/merge/main
2024-05-06 14:43:38 +08:00
Steve Luo
725fbd2ed0
[Inference] Remove unnecessary float4_ and rename float8_ to float8 (#5679) 2024-05-06 10:55:34 +08:00
Yuanheng Zhao
8754abae24 [Fix] Fix & Update Inference Tests (compatibility w/ main) 2024-05-05 16:28:56 +00:00
Yuanheng Zhao
56ed09aba5 [sync] resolve conflicts of merging main 2024-05-05 05:14:00 +00:00
Yuanheng Zhao
537a3cbc4d
[kernel] Support New KCache Layout - Triton Kernel (#5677)
* kvmemcpy triton for new kcache layout

* revise tests for new kcache layout

* naive triton flash decoding - new kcache layout

* rotary triton kernel - new kcache layout

* remove redundancy - triton decoding

* remove redundancy - triton kvcache copy

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-05-03 17:20:45 +08:00
wangbluo
2632916329 remove useless code 2024-05-01 09:23:43 +00:00
傅剑寒
9df016fc45
[Inference] Fix quant bits order (#5681) 2024-04-30 19:38:00 +08:00
yuehuayingxueluo
f79963199c
[inference]Add alibi to flash attn function (#5678)
* add alibi to flash attn function

* rm redundant modifications
2024-04-30 19:35:05 +08:00
傅剑寒
ef8e4ffe31
[Inference/Feat] Add kvcache quant support for fused_rotary_embedding_cache_copy (#5680) 2024-04-30 18:33:53 +08:00
wangbluo
9efc79ef24 add parallel output for mistral model 2024-04-30 08:10:20 +00:00
Steve Luo
5cd75ce4c7
[Inference/Kernel] refactor kvcache manager and rotary_embedding and kvcache_memcpy oper… (#5663)
* refactor kvcache manager and rotary_embedding and kvcache_memcpy operator

* refactor decode_kv_cache_memcpy

* enable alibi in pagedattention
2024-04-30 15:52:23 +08:00
yuehuayingxueluo
5f00002e43
[Inference] Adapt Baichuan2-13B TP (#5659)
* adapt to baichuan2 13B

* add baichuan2 13B TP

* update baichuan tp logic

* rm unused code

* Fix TP logic

* fix alibi slopes tp logic

* rm nn.Module

* Polished the code.

* change BAICHUAN_MODEL_NAME_OR_PATH

* Modified the logic for loading Baichuan weights.

* fix typos
2024-04-30 15:47:07 +08:00
傅剑寒
808ee6e4ad
[Inference/Feat] Feat quant kvcache step2 (#5674) 2024-04-30 11:26:36 +08:00
Wang Binluo
d3f34ee8cc
[Shardformer] add assert for num of attention heads divisible by tp_size (#5670)
* add assert for num of attention heads divisible by tp_size

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-29 18:47:47 +08:00
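The commit above adds a divisibility assert. The kind of check it describes (illustrative, not the exact policy code):

```python
def check_tp_divisibility(num_attention_heads: int, tp_size: int) -> None:
    assert num_attention_heads % tp_size == 0, (
        f"The number of attention heads ({num_attention_heads}) must be "
        f"divisible by the tensor parallel size ({tp_size})."
    )
```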
flybird11111
6af6d6fc9f
[shardformer] support bias_gelu_jit_fused for models (#5647)
* support gelu_bias_fused for gpt2

* support gelu_bias_fused for gpt2

fix

fix

fix

* fix

fix

* fix
2024-04-29 15:33:51 +08:00
Hongxin Liu
7f8b16635b
[misc] refactor launch API and tensor constructor (#5666)
* [misc] remove config arg from initialize

* [misc] remove old tensor contrusctor

* [plugin] add npu support for ddp

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [devops] fix doc test ci

* [test] fix test launch

* [doc] update launch doc

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-29 10:40:11 +08:00
linsj20
91fa553775 [Feature] qlora support (#5586)
* [feature] qlora support

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* qlora follow commit

* migrate qutization folder to colossalai/

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* minor fixes

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-28 10:51:27 +08:00
flybird11111
8954a0c2e2 [LowLevelZero] low level zero support lora (#5153)
* low level zero support lora

low level zero support lora

* add checkpoint test

* add checkpoint test

* fix

* fix

* fix

* fix

fix

fix

fix

* fix

* fix

fix

fix

fix

fix

fix

fix

* fix

* fix

fix

fix

fix

fix

fix

fix

* fix

* test ci

* git # This is a combination of 3 commits.

Update low_level_zero_plugin.py

Update low_level_zero_plugin.py

fix

fix

fix

* fix naming

fix naming

fix naming

fix
2024-04-28 10:51:27 +08:00
Baizhou Zhang
14b0d4c7e5 [lora] add lora APIs for booster, support lora for TorchDDP (#4981)
* add apis and peft requirement

* add liscense and implement apis

* add checkpointio apis

* add torchddp fwd_bwd test

* add support_lora methods

* add checkpointio test and debug

* delete unneeded codes

* remove peft from LICENSE

* add concrete methods for enable_lora

* simplify enable_lora api

* fix requirements
2024-04-28 10:51:27 +08:00
Hongxin Liu
c1594e4bad
[devops] fix release docker ci (#5665) 2024-04-27 19:11:57 +08:00
Hongxin Liu
4cfbf30a5e
[release] update version (#5654) 2024-04-27 18:59:47 +08:00
Tong Li
68ec99e946
[hotfix] add soft link to support required files (#5661) 2024-04-26 21:12:04 +08:00
傅剑寒
8ccb6714e7
[Inference/Feat] Add kvcache quantization support for FlashDecoding (#5656) 2024-04-26 19:40:37 +08:00
Yuanheng Zhao
5be590b99e
[kernel] Support new KCache Layout - Context Attention Triton Kernel (#5658)
* add context attn triton kernel - new kcache layout

* add benchmark triton

* tiny revise

* trivial - code style, comment
2024-04-26 17:51:49 +08:00
binmakeswell
b8a711aa2d
[news] llama3 and open-sora v1.1 (#5655)
* [news] llama3 and open-sora v1.1

* [news] llama3 and open-sora v1.1
2024-04-26 15:36:37 +08:00
Hongxin Liu
2082852f3f
[lazyinit] skip whisper test (#5653) 2024-04-26 14:03:12 +08:00
flybird11111
8b7d535977
fix gptj (#5652) 2024-04-26 11:52:27 +08:00
yuehuayingxueluo
3c91e3f176
[Inference]Adapt to baichuan2 13B (#5614)
* adapt to baichuan2 13B

* adapt to baichuan2 13B

* change BAICHUAN_MODEL_NAME_OR_PATH

* fix test_decoding_attn.py

* Modifications based on review comments.

* change BAICHUAN_MODEL_NAME_OR_PATH

* mv attn mask processes to test flash decoding

* mv get_alibi_slopes baichuan modeling

* fix bugs in test_baichuan.py
2024-04-25 23:11:30 +08:00
Yuanheng Zhao
f342a93871
[Fix] Remove obsolete files - inference (#5650) 2024-04-25 22:04:59 +08:00
Hongxin Liu
1b387ca9fe
[shardformer] refactor pipeline grad ckpt config (#5646)
* [shardformer] refactor pipeline grad ckpt config

* [shardformer] refactor pipeline grad ckpt config

* [pipeline] fix stage manager
2024-04-25 15:19:30 +08:00
Season
7ef91606e1
[Fix]: implement thread-safe singleton to avoid deadlock for very large-scale training scenarios (#5625)
* implement thread-safe singleton

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* refactor singleton implementation

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-25 14:45:52 +08:00
Hongxin Liu
bbb2c21f16
[shardformer] fix chatglm implementation (#5644)
* [shardformer] fix chatglm policy

* [shardformer] fix chatglm flash attn

* [shardformer] update readme

* [shardformer] fix chatglm init

* [shardformer] fix chatglm test

* [pipeline] fix chatglm merge batch
2024-04-25 14:41:17 +08:00
Steve Luo
a8fd3b0342
[Inference/Kernel] Optimize paged attention: Refactor key cache layout (#5643)
* optimize flashdecodingattention: refactor code with different key cache layout (from [num_blocks, num_kv_heads, block_size, head_size] to [num_blocks, num_kv_heads, head_size/x, block_size, x]); see the layout sketch after this entry

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-25 14:24:02 +08:00
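The entry above changes the key-cache layout. A small PyTorch illustration of how the two layouts described in the commit message relate (x is the vectorization factor; this is a sketch, not the kernel code):

```python
import torch

num_blocks, num_kv_heads, block_size, head_size, x = 4, 2, 16, 128, 8

old_kcache = torch.randn(num_blocks, num_kv_heads, block_size, head_size)

new_kcache = (
    old_kcache.view(num_blocks, num_kv_heads, block_size, head_size // x, x)
    .permute(0, 1, 3, 2, 4)   # -> [blocks, kv_heads, head_size // x, block_size, x]
    .contiguous()
)
assert new_kcache.shape == (num_blocks, num_kv_heads, head_size // x, block_size, x)
```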
flybird11111
5d88ef1aaf
[shardformer] remove useless code (#5645) 2024-04-25 13:46:39 +08:00
flybird11111
148506c828
[coloattention]modify coloattention (#5627)
* modify coloattention

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* fix

* fix

fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-25 10:47:14 +08:00
Edenzzzz
7ee569b05f
[hotfix] Fixed fused layernorm bug without apex (#5609)
* fixed fused layernorm bug without apex

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* same for flash attn

* remove flash attn check

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-24 23:04:06 +08:00
Wang Binluo
0d0a582033
[shardformer] update transformers (#5583)
* flash_attention forward upgrade

* llama_model_forward

* remove useless comment

* update the requirements.txt

* add the transformers version requirements

* remove the LATEST VERSION try

* [shardformer] update bloom model (#5518)

* update bloom model

* remove the version restriction

* [shardformer] update_falcon (#5520)

* [shardformer] update mistral model (#5511)

* [shardformer] update gpt2 (#5502)

* [shardformer] update gptj model (#5503)

* [shardformer] update opt (#5522)

* [shardformer] update t5 model (#5524)

* [shardformer] update whisper model (#5529)

* [shardformer] update vit model (#5530)

* update vit model

* remove the output_hidden_states

* [shardformer] fix llama modeling

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [zero] support multiple (partial) backward passes (#5596)

* [zero] support multiple (partial) backward passes

* [misc] update requirements

* [zero] support multiple (partial) backward passes (#5596)

* [zero] support multiple (partial) backward passes

* [misc] update requirements

* fix conflicts

* [doc] fix ColossalMoE readme (#5599)

* fix readme

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* merge with main

* merge with main

* llama_model_forward

* remove useless comment

* remove the LATEST VERSION try

* [shardformer] update bloom model (#5518)

* update bloom model

* remove the version restriction

* [shardformer] update mistral model (#5511)

* [shardformer] update opt (#5522)

* [shardformer] update whisper model (#5529)

* [shardformer] fix llama modeling

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* [hotfix] Fix examples no pad token & auto parallel codegen bug; (#5606)

* fix no pad token bug

* fixed some auto parallel codegen bug, but might not run on torch 2.1

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [shardformer] fix pipeline grad ckpt (#5620)

* [shardformer] fix pipeline grad ckpt

* [shardformer] fix whisper (#5628)

* [test] fix llama model test

* fix the opt upgrade (#5634)

* [shardformer] fix attn replacement (#5636)

* [shardformer] update flashattention replacement (#5637)

* update transformers

update transformers

fix

fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [test] fix llama test (#5638)

* [gemini] fix buffer cast (#5639)

* Fix shardformer upgrade (#5640)

* fix llama model

* fix the mistral

* fix the shardformer model

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [shardformer]support pipeline parallelism for mistral. (#5642)

* [shardformer] fix attn replacement (#5636)

* [shardformer] update flashattention replacement (#5637)

* update transformers

update transformers

fix

fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [Feature] Support LLaMA-3 CPT and ST (#5619)

* support LLaMA-3

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Run pre-commit

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [example] update llama example (#5626)

* [plugin] support dp inside for hybrid parallel

* [example] update llama benchmark

* [example] update llama benchmark

* [example] update llama readme

* [example] update llama readme

* [example] llama3 (#5631)

* release llama3

* [release] llama3

* [release] llama3

* [release] llama3

* [release] llama3

* [test] fix llama test (#5638)

* [gemini] fix buffer cast (#5639)

* support pp for mistral

* fix

* fix

fix

fix

* fix

---------

Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>

---------

Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: flybird11111 <1829166702@qq.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
2024-04-24 22:51:50 +08:00
yuehuayingxueluo
90cd5227a3
[Fix/Inference]Fix vllm benchmark (#5630)
* Fix bugs about OOM when running vllm-0.4.0

* rm used params

* change generation_config

* change benchmark log file name
2024-04-24 14:51:36 +08:00
傅剑寒
279300dc5f
[Inference/Refactor] Refactor compilation mechanism and unified multi hw (#5613)
* refactor compilation mechanism and unified multi hw

* fix file path bug

* add init.py to make pybind a module to avoid relative path error caused by softlink

* delete duplicated macros

* fix macros bug in gcc
2024-04-24 14:17:54 +08:00
Yuanheng Zhao
04863a9b14
[example] Update Llama Inference example (#5629)
* [example] add infernece benchmark llama3

* revise inference config - arg

* remove unused args

* add llama generation demo script

* fix init rope in llama policy

* add benchmark-llama3 - cleanup
2024-04-23 22:23:07 +08:00
binmakeswell
f4c5aafe29
[example] llama3 (#5631)
* release llama3

* [release] llama3

* [release] llama3

* [release] llama3

* [release] llama3
2024-04-23 18:48:07 +08:00
Hongxin Liu
4de4e31818
[example] update llama example (#5626)
* [plugin] support dp inside for hybrid parallel

* [example] update llama benchmark

* [example] update llama benchmark

* [example] update llama readme

* [example] update llama readme
2024-04-23 14:12:20 +08:00
Tong Li
862fbaaa62
[Feature] Support LLaMA-3 CPT and ST (#5619)
* support LLaMA-3

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* Run pre-commit

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-23 13:54:05 +08:00
yuehuayingxueluo
12f10d5b0b
[Fix/Inference]Fix CUDA Rotary Embedding GQA (#5623)
* fix rotary embedding GQA

* change test_rotary_embdding_unpad.py KH
2024-04-23 13:44:49 +08:00
Yuanheng Zhao
5d4c1fe8f5
[Fix/Inference] Fix GQA Triton and Support Llama3 (#5624)
* [fix] GQA calling of flash decoding triton

* fix kv cache alloc shape

* fix rotary triton - GQA

* fix sequence max length assigning

* Sequence max length logic

* fix scheduling and spec-dec

* skip without import error

* fix pytest - skip without ImportError

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-23 13:09:55 +08:00
Hongxin Liu
e094933da1
[shardformer] fix pipeline grad ckpt (#5620)
* [shardformer] fix pipeline grad ckpt
2024-04-22 11:25:39 +08:00
Steve Luo
ccf72797e3
feat baichuan2 rmsnorm whose hidden size equals to 5120 (#5611) 2024-04-19 15:34:53 +08:00
Edenzzzz
d83c633ca6
[hotfix] Fix examples no pad token & auto parallel codegen bug; (#5606)
* fix no pad token bug

* fixed some auto parallel codegen bug, but might not run on torch 2.1

---------

Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-04-18 18:15:50 +08:00
Runyu Lu
e37ee2fb65
[Feat]Tensor Model Parallel Support For Inference (#5563)
* tensor parallel support naive source

* [fix]precision, model load and refactor the framework

* add tp unit test

* docstring

* fix do_sample
2024-04-18 16:56:46 +08:00
Steve Luo
be396ad6cc
[Inference/Kernel] Add Paged Decoding kernel, sequence split within the same thread block (#5531)
* feat flash decoding for paged attention

* refactor flashdecodingattention

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-18 16:45:07 +08:00
flybird11111
a0ad587c24
[shardformer] refactor embedding resize (#5603)
* [branch rebase] rebase main to Feature/resize_embedding (#5554)

* fix

* [release] update version (#5411)

* [hotfix] fix typo s/keywrods/keywords etc. (#5429)

* [devops] fix compatibility (#5444)

* [devops] fix compatibility

* [hotfix] update compatibility test on pr

* [devops] fix compatibility

* [devops] record duration during comp test

* [test] decrease test duration

* fix falcon

* [shardformer] fix gathering output when using tensor parallelism (#5431)

* fix

* padding vocab_size when using pipeline parallelism

padding vocab_size when using pipeline parallelism

fix

fix

* fix

* fix

fix

fix

* fix gather output

* fix

* fix

* fix

fix resize embedding

fix resize embedding

* fix resize embedding

fix

* revert

* revert

* revert

* [doc] release Open-Sora 1.0 with model weights (#5468)

* [doc] release Open-Sora 1.0 with model weights

* [doc] release Open-Sora 1.0 with model weights

* [doc] release Open-Sora 1.0 with model weights

* [doc] update open-sora demo (#5479)

* [doc] update open-sora demo

* [doc] update open-sora demo

* [doc] update open-sora demo

* [example] add grok-1 inference (#5485)

* [misc] add submodule

* remove submodule

* [example] support grok-1 tp inference

* [example] add grok-1 inference script

* [example] refactor code

* [example] add grok-1 readme

* [example] add test ci

* [example] update readme

---------

Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>

* [CI] run pre-commit (#5577)

* fix

* [release] update version (#5411)

* [hotfix] fix typo s/keywrods/keywords etc. (#5429)

* [devops] fix compatibility (#5444)

* [devops] fix compatibility

* [hotfix] update compatibility test on pr

* [devops] fix compatibility

* [devops] record duration during comp test

* [test] decrease test duration

* fix falcon

* [shardformer] fix gathering output when using tensor parallelism (#5431)

* fix

* padding vocab_size when using pipeline parallelism

padding vocab_size when using pipeline parallelism

fix

fix

* fix

* fix

fix

fix

* fix gather output

* fix

* fix

* fix

fix resize embedding

fix resize embedding

* fix resize embedding

fix

* revert

* revert

* revert

* [doc] release Open-Sora 1.0 with model weights (#5468)

* [doc] release Open-Sora 1.0 with model weights

* [doc] release Open-Sora 1.0 with model weights

* [doc] release Open-Sora 1.0 with model weights

* [doc] update open-sora demo (#5479)

* [doc] update open-sora demo

* [doc] update open-sora demo

* [doc] update open-sora demo

* [example] add grok-1 inference (#5485)

* [misc] add submodule

* remove submodule

* [example] support grok-1 tp inference

* [example] add grok-1 inference script

* [example] refactor code

* [example] add grok-1 readme

* [example] add test ci

* [example] update readme

* run pre-commit

---------

Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>

* [rebase] rebase main to resize-embedding (#5581)

* [release] grok-1 314b inference (#5490)

* [release] grok-1 inference

* [release] grok-1 inference

* [release] grok-1 inference

* [example] update Grok-1 inference (#5495)

* revise grok-1 example

* remove unused arg in scripts

* prevent re-installing torch

* update readme

* revert modifying colossalai requirements

* add perf

* trivial

* add tokenizer url

* [hotfix] set return_outputs=False in examples and polish code (#5404)

* fix: simplify merge_batch

* fix: use return_outputs=False to eliminate extra memory consumption

* feat: add return_outputs warning

* style: remove `return_outputs=False` as it is the default value

* [release] grok-1 inference benchmark (#5500)

* [release] grok-1 inference benchmark

* [release] grok-1 inference benchmark

* [release] grok-1 inference benchmark

* [release] grok-1 inference benchmark

* [release] grok-1 inference benchmark

* [shardformer]Fix lm parallel. (#5480)

* fix

* padding vocab_size when using pipeline parallelism

padding vocab_size when using pipeline parallelism

fix

fix

* fix

* fix

fix

fix

* fix gather output

* fix

* fix

* fix

fix resize embedding

fix resize embedding

* fix resize embedding

fix

* revert

* revert

* revert

* fix lm forward distribution

* fix

* test ci

* fix

* [fix] fix grok-1 example typo (#5506)

* [devops] fix example test ci (#5504)

* Fix ColoTensorSpec for py11 (#5440)

* fixed layout converter caching and updated tester

* Empty-Commit

* [shardformer] update colo attention to support custom mask (#5510)

* [feature] refactor colo attention (#5462)

* [extension] update api

* [feature] add colo attention

* [feature] update sdpa

* [feature] update npu attention

* [feature] update flash-attn

* [test] add flash attn test

* [test] update flash attn test

* [shardformer] update modeling to fit colo attention (#5465)

* [misc] refactor folder structure

* [shardformer] update llama flash-attn

* [shardformer] fix llama policy

* [devops] update tensornvme install

* [test] update llama test

* [shardformer] update colo attn kernel dispatch

* [shardformer] update blip2

* [shardformer] update chatglm

* [shardformer] update gpt2

* [shardformer] update gptj

* [shardformer] update opt

* [shardformer] update vit

* [shardformer] update colo attention mask prep

* [shardformer] update whisper

* [test] fix shardformer tests (#5514)

* [test] fix shardformer tests

* [test] fix shardformer tests

* [format] applied code formatting on changed files in pull request 5510 (#5517)

Co-authored-by: github-actions <github-actions@github.com>

* [shardformer] fix pipeline forward error if custom layer distribution is used (#5189)

* Use self.[distribute_layers|get_stage_index] to exploit custom layer distribution

* Change static methods for t5 layer distribution to member functions

* Change static methods for whisper layer distribution to member functions

* Replace whisper policy usage with self one

* Fix test case to use non-static layer distribution methods

* fix: fix typo

---------

Co-authored-by: Wenhao Chen <cwher@outlook.com>

* [Fix] Grok-1 use tokenizer from the same pretrained path (#5532)

* [fix] use tokenizer from the same pretrained path

* trust remote code

* [ColossalChat] Update RLHF V2 (#5286)

* Add dpo. Fix sft, ppo, lora. Refactor all

* fix and tested ppo

* 2 nd round refactor

* add ci tests

* fix ci

* fix ci

* fix readme, style

* fix readme style

* fix style, fix benchmark

* reproduce benchmark result, remove useless files

* rename to ColossalChat

* use new image

* fix ci workflow

* fix ci

* use local model/tokenizer for ci tests

* fix ci

* fix ci

* fix ci

* fix ci timeout

* fix rm progress bar. fix ci timeout

* fix ci

* fix ci typo

* remove 3d plugin from ci temporary

* test environment

* cannot save optimizer

* support chat template

* fix readme

* fix path

* test ci locally

* restore build_or_pr

* fix ci data path

* fix benchmark

* fix ci, move ci tests to 3080, disable fast tokenizer

* move ci to 85

* support flash attention 2

* add all-in-one data preparation script. Fix colossal-llama2-chat chat template

* add hardware requirements

* move ci test data

* fix save_model, add unwrap

* fix missing bos

* fix missing bos; support grad accumulation with gemini

* fix ci

* fix ci

* fix ci

* fix llama2 chat template config

* debug sft

* debug sft

* fix colossalai version requirement

* fix ci

* add sanity check to prevent NaN loss

* fix requirements

* add dummy data generation script

* add dummy data generation script

* add dummy data generation script

* add dummy data generation script

* update readme

* update readme

* update readme and ignore

* fix logger bug

* support parallel_output

* modify data preparation logic

* fix tokenization

* update lr

* fix inference

* run pre-commit

---------

Co-authored-by: Tong Li <tong.li352711588@gmail.com>

* [shardformer, pipeline] add `gradient_checkpointing_ratio` and heterogeneous shard policy for llama (#5508)

* feat: add `GradientCheckpointConfig` and `PipelineGradientCheckpointConfig`

* feat: apply `GradientCheckpointConfig` to policy and llama_forward

* feat: move `distribute_layer` and `get_stage_index` to PipelineStageManager

* fix: add optional args for `distribute_layer` and `get_stage_index`

* fix: fix changed API calls

* test: update llama tests

* style: polish `GradientCheckpointConfig`

* fix: fix pipeline utils tests

* fix incorrect sharding without zero (#5545)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [shardformer] Sequence Parallelism Optimization (#5533)

* sequence parallel optimization

* validate sequence parallel in llama (code to be polished)

* shardformer api writing

* integrate sequence parallel in ShardFormer

* fix pp bugs and sp bugs for LlaMa model

* integrating ring-based sequence parallelism into ShardFormer

* [sequence parallelism]: Add fused megatron function

* integrating ring-based sequence parallelism into ShardFormer

---------

Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>

* fix bugs when using sp and flash attention together

* fix operation function name

* support flash attention for ulysses-style sp

* clarify sp process group

* fix compatibility bugs in moe plugin

* fix fused linear bugs

* fix linear layer test

* support gpt model all-to-all sp

* modify shard data dimension (meant to be dim=-1)

* support megatron-style sp and distributed attn for llama model

* [shardformer] add megatron sp to llama

* support llama7B 128k with distributed attention

* [shardformer] robustness enhancement

* add block attn

* sp mode 1: keep input as a complete sequence

* fix sp compatibility

* finish sp mode 3 support for gpt

* using all_to_all_single when batch size is 1

* support mode 2 sp in gpt2 (#5)

* [shardformer] add megatron sp to llama

* support llama7B 128k with distributed attention

* [shardformer] robustness enhancement

* add block attn

* sp mode 1: keep input as a complete sequence

* fix sp compatibility

* refactor ring implementation

* support mode 2 sp in gpt2

* polish code

* enable distributed attn mask when using sp mode 2 and 3 in llama

* automatically enable flash attn when using sp mode 2 and 3 in llama

* inplace attn mask

* add zero2 support for sequence parallel

* polish code

* fix bugs

* fix gemini checkpoint io

* loose tensor checking atol and rtol

* add comment

* fix llama layernorm grad

* fix zero grad

* fix zero grad

* fix conflict

* update split and gather auto grad func

* sequence parallel: inside text split (#6)

* polish code (part 1)

* polish code (part 2)

* polish code (part 2.5)

* polish code (part 3)

* sequence parallel: inside text split

* miscellaneous minor fixes

* polish code

* fix ulysses style ZeRO

* sequence parallel: inside text split

* miscellaneous minor fixes

* disaggregate sp group and dp group for sp

* fix llama and gpt sp

* polish code

* move ulysses grad sync to ddp (#9)

* remove zero_stage and unbind the grad sync for alltoall sp

* add 2d group creation test

* move ulysses grad sync to ddp

* add 2d group creation test

* remove useless code

* change shard config not to enable sp when enable_all_optimizations

* add sp warnings for several models

* remove useless code

---------

Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>

* [hotfix] quick fixes to make legacy tutorials runnable (#5559)

Co-authored-by: Edenzzzz <wtan45@wisc.edu>

* [fix] fix typo s/muiti-node /multi-node etc. (#5448)

* [hotfix] fix typo s/get_defualt_parser /get_default_parser (#5548)

* [devops] remove post commit ci (#5566)

* [devops] remove post commit ci

* [misc] run pre-commit on all files

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

---------

Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: Rocky Duan <dementrock@users.noreply.github.com>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>

* [shardformer]enable padding vocabulary size. (#5489)

* padding vocab_size when using pipeline parallelism

padding vocab_size when using pipeline parallelism

fix

fix

* fix

* fix

fix

fix

* fix gather output

* fix

* fix

* fix

fix resize embedding

fix resize embedding

* fix resize embedding

fix

* revert

* revert

* revert

* padding vocab

* padding vocab

* fix

* fix

* fix

* test ci

* fix

fix

fix

fix

* fix

fix

* fix

* fix

* Update hybrid_parallel_plugin.py

fix

fix

fix

* fix

fix

* fix

fix

* fix

* resolve super init

resolve super init

resolve super init

resolve super init

* resolve comments

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* vocab checkpointio

* padding vocab_size when using pipeline parallelism

padding vocab_size when using pipeline parallelism

fix

fix

* fix

fix

fix

* fix

* fix

fix resize embedding

fix resize embedding

* fix resize embedding

fix

* revert

* revert

* padding vocab

* fix

* fix

fix

* fix

fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix ci

* fix

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

* cherry-pick

* revert moe modify

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix

fix

fix

fix

fix

fix

fix

fix

* resolve comments

resolve comments

resolve comments

resolve comments

resolve comments

* ptensor

ptensor

resolve comments

fix

fix

fix

fix

fix

resolve comments

resolve comments

resolve comments

resolve comments

resolve comments

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

* fix rebase

* fix rebase

---------

Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Yuanheng Zhao <54058983+yuanheng-zhao@users.noreply.github.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: Rocky Duan <dementrock@users.noreply.github.com>
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
Co-authored-by: Edenzzzz <wenxuan.tan@wisc.edu>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Insu Jang <insujang@umich.edu>
Co-authored-by: YeAnbang <44796419+YeAnbang@users.noreply.github.com>
Co-authored-by: Tong Li <tong.li352711588@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-18 16:10:18 +08:00
Hongxin Liu
3788fefc7a
[zero] support multiple (partial) backward passes (#5596)
* [zero] support multiple (partial) backward passes

* [misc] update requirements
2024-04-16 17:49:21 +08:00
Camille Zhong
89049b0d89
[doc] fix ColossalMoE readme (#5599)
* fix readme

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-15 18:06:18 +08:00
yuehuayingxueluo
56b222eff8
[inference/model]Adapted to the baichuan2-7B model (#5591)
* Adapted to the baichuan2-7B model

* modified according to the review comments.

* Modified the method of obtaining random weights.

* modified according to the review comments.

* change mlp layer 'NOTE'
2024-04-15 16:53:02 +08:00
傅剑寒
d4cb023b62
[Inference/Refactor] Delete Duplicated code and refactor vec_copy utils and reduce utils (#5593)
* delete duplicated code and refactor vec_copy utils and reduce utils

* delete unused header file
2024-04-15 10:57:51 +08:00
傅剑寒
a21912339a
refactor csrc (#5582) 2024-04-11 15:41:36 +08:00
Yuanheng Zhao
25928d8496
[Inference/Spec-Dec] Merge pull request #5565 from hpcaitech/feat/speculative-decoding
Add Speculative Decoding and GLIDE Spec-Dec
2024-04-10 18:39:27 +08:00
Yuanheng
f8598e3ec5 [Fix] Llama Modeling Control with Spec-Dec (#5580)
- fix reference before assignment
- fall back to use triton kernels when using spec-dec
2024-04-10 18:19:44 +08:00
Yuanheng Zhao
e60d430cf5 [Fix] resolve conflicts of rebasing feat/speculative-decoding (#5557)
- resolve conflicts of rebasing feat/speculative-decoding
2024-04-10 18:13:49 +08:00
Yuanheng Zhao
e1acb58423 [doc] Add inference/speculative-decoding README (#5552)
* add README for spec-dec

* update roadmap
2024-04-10 11:07:52 +08:00
Yuanheng Zhao
d85d91435a [Inference/SpecDec] Support GLIDE Drafter Model (#5455)
* add glide-llama policy and modeling

* update glide modeling, compatible with transformers 4.36.2

* revise glide llama modeling/usage

* fix issues of glimpsing large kv

* revise the way re-loading params for glide drafter

* fix drafter and engine tests

* enable convert to glide strict=False

* revise glide llama modeling

* revise vicuna prompt template

* revise drafter and tests

* apply usage of glide model in engine
2024-04-10 11:07:52 +08:00
Yuanheng Zhao
912e24b2aa [SpecDec] Fix inputs for speculation and revise past KV trimming (#5449)
* fix drafter pastkv and usage of batch bucket
2024-04-10 11:07:52 +08:00
Yuanheng Zhao
a37f82629d [Inference/SpecDec] Add Speculative Decoding Implementation (#5423)
* fix flash decoding mask during verification

* add spec-dec

* add test for spec-dec

* revise drafter init

* remove drafter sampling

* retire past kv in drafter

* (trivial) rename attrs

* (trivial) rename arg

* revise how we enable/disable spec-dec
2024-04-10 11:07:52 +08:00
Yuanheng Zhao
5a9b05f7b2 [Inference/SpecDec] Add Basic Drafter Model Container (#5405)
* [Infer/Fix] Fix Dependency in test - RMSNorm kernel (#5399)

fix dependency in pytest

* add drafter model container (basic ver)
2024-04-10 11:07:51 +08:00
Yuanheng Zhao
d63c469f45 [Infer] Revise and Adapt Triton Kernels for Spec-Dec (#5401)
* [Infer/Fix] Fix Dependency in test - RMSNorm kernel (#5399)

fix dependency in pytest

* resolve conflicts for revising flash-attn

* adapt kv cache copy kernel for spec-dec

* fix seqlen-n kvcache copy kernel/tests

* test kvcache copy - use torch.equal

* add assertions

* (trivial) comment out
2024-04-10 11:07:51 +08:00
Yuanheng Zhao
d56c96334e
Sync main to feature/colossal-infer
[Sync] Merge feature/colossal-infer with main
2024-04-09 10:09:34 +08:00
Yuanheng
7ca1d1c545 remove outdated triton test 2024-04-08 17:00:55 +08:00
pre-commit-ci[bot]
d78817539e [pre-commit.ci] auto fixes from pre-commit.com hooks
for more information, see https://pre-commit.ci
2024-04-08 08:41:09 +00:00
Yuanheng
ce9401ad52 remove unused triton kernels 2024-04-08 16:25:12 +08:00
Yuanheng
ed5ebd1735 [Fix] resolve conflicts of merging main 2024-04-08 16:21:47 +08:00
Hongxin Liu
641b1ee71a
[devops] remove post commit ci (#5566)
* [devops] remove post commit ci

* [misc] run pre-commit on all files

* [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
2024-04-08 15:09:40 +08:00
傅剑寒
7ebdf48ac5
add cast and op_functor for cuda build-in types (#5546) 2024-04-08 11:38:05 +08:00
digger yu
341263df48
[hotfix] fix typo s/get_defualt_parser /get_default_parser (#5548) 2024-04-07 19:04:58 +08:00
digger yu
a799ca343b
[fix] fix typo s/muiti-node /multi-node etc. (#5448) 2024-04-07 18:42:15 +08:00
Edenzzzz
15055f9a36
[hotfix] quick fixes to make legacy tutorials runnable (#5559)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-04-07 12:06:27 +08:00
Zhongkai Zhao
8e412a548e
[shardformer] Sequence Parallelism Optimization (#5533)
* sequence parallel optimization

* validate sequence parallel in llama (code to be polished)

* shardformer api writing

* integrate sequence parallel in ShardFormer

* fix pp bugs and sp bugs for LlaMa model

* integrating ring-based sequence parallelism into ShardFormer

* [sequence parallelism]: Add fused megatron function

* integrating ring-based sequence parallelism into ShardFormer

---------

Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>

* fix bugs when using sp and flash attention together

* fix operation function name

* support flash attention for ulysses-style sp

* clarify sp process group

* fix compatibility bugs in moe plugin

* fix fused linear bugs

* fix linear layer test

* support gpt model all-to-all sp

* modify shard data dimension (meant to be dim=-1)

* support megatron-style sp and distributed attn for llama model

* [shardformer] add megatron sp to llama

* support llama7B 128k with distributed attention

* [shardformer] robustness enhancement

* add block attn

* sp mode 1: keep input as a complete sequence

* fix sp compatibility

* finish sp mode 3 support for gpt

* using all_to_all_single when batch size is 1

* support mode 2 sp in gpt2 (#5)

* [shardformer] add megatron sp to llama

* support llama7B 128k with distributed attention

* [shardformer] robustness enhancement

* add block attn

* sp mode 1: keep input as a complete sequence

* fix sp compatibility

* refactor ring implementation

* support mode 2 sp in gpt2

* polish code

* enable distributed attn mask when using sp mode 2 and 3 in llama

* automatically enable flash attn when using sp mode 2 and 3 in llama

* inplace attn mask

* add zero2 support for sequence parallel

* polish code

* fix bugs

* fix gemini checkpoint io

* loose tensor checking atol and rtol

* add comment

* fix llama layernorm grad

* fix zero grad

* fix zero grad

* fix conflict

* update split and gather auto grad func

* sequence parallel: inside text split (#6)

* polish code (part 1)

* polish code (part 2)

* polish code (part 2.5)

* polish code (part 3)

* sequence parallel: inside text split

* miscellaneous minor fixes

* polish code

* fix ulysses style ZeRO

* sequence parallel: inside text split

* miscellaneous minor fixes

* disaggregate sp group and dp group for sp

* fix llama and gpt sp

* polish code

* move ulysses grad sync to ddp (#9)

* remove zero_stage and unbind the grad sync for alltoall sp

* add 2d group creation test

* move ulysses grad sync to ddp

* add 2d group creation test

* remove useless code

* change shard config not to enable sp when enable_all_optimizations

* add sp warnings for several models

* remove useless code

---------

Co-authored-by: linsj20 <linsj20@mails.tsinghua.edu.cn>
2024-04-03 17:15:47 +08:00
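
The sequence-parallel commit above mixes several SP modes; the Ulysses-style (all-to-all) mode it mentions redistributes sequence-sharded Q/K/V into head-sharded tensors so that each rank can run ordinary (flash) attention over the full sequence for its subset of heads. Below is a minimal single-process sketch of that redistribution only; the real implementation would use `torch.distributed.all_to_all` across the SP process group, and the names and shapes here are illustrative assumptions, not the actual ShardFormer API.

```python
# Single-process simulation of the Ulysses-style all-to-all used by
# all-to-all sequence parallelism (illustrative sketch, assumed shapes).
import torch

def ulysses_all_to_all_sim(q_shards, sp_size):
    """Turn per-rank sequence-sharded tensors [B, S/sp, H, D]
    into per-rank head-sharded tensors [B, S, H/sp, D]."""
    B, s_local, H, D = q_shards[0].shape
    assert H % sp_size == 0
    h_local = H // sp_size
    out = []
    for dst in range(sp_size):
        # each destination rank gathers its head slice from every source rank,
        # concatenated along the sequence dimension
        pieces = [shard[:, :, dst * h_local:(dst + 1) * h_local, :] for shard in q_shards]
        out.append(torch.cat(pieces, dim=1))  # [B, S, H/sp, D]
    return out

if __name__ == "__main__":
    sp_size, B, S, H, D = 4, 2, 16, 8, 32
    full = torch.randn(B, S, H, D)
    shards = list(full.chunk(sp_size, dim=1))        # sequence-sharded inputs
    head_shards = ulysses_all_to_all_sim(shards, sp_size)
    # each "rank" now holds the full sequence for H/sp heads, so local
    # attention sees no missing context
    rebuilt = torch.cat(head_shards, dim=2)
    assert torch.equal(rebuilt, full)
```
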
Edenzzzz
7e0ec5a85c
fix incorrect sharding without zero (#5545)
Co-authored-by: Edenzzzz <wtan45@wisc.edu>
2024-04-02 20:11:18 +08:00
Yuanheng Zhao
4bb5d8923a
[Fix/Inference] Remove unused and non-functional functions (#5543)
* [fix] remove unused func

* rm non-functional partial
2024-04-02 14:16:59 +08:00
傅剑寒
a2878e39f4
[Inference] Add Reduce Utils (#5537)
* add reduce utils

* add using to delele namespace prefix
2024-04-01 15:34:25 +08:00
yuehuayingxueluo
04aca9e55b
[Inference/Kernel]Add get_cos_and_sin Kernel (#5528)
* Add get_cos_and_sin kernel

* fix code comments

* fix code typos

* merge common codes of get_cos_and_sin kernel.

* Fixed a typo

* Changed 'assert allclose' to 'assert equal'.
2024-04-01 13:47:14 +08:00
Wenhao Chen
e614aa34f3
[shardformer, pipeline] add gradient_checkpointing_ratio and heterogeneous shard policy for llama (#5508)
* feat: add `GradientCheckpointConfig` and `PipelineGradientCheckpointConfig`

* feat: apply `GradientCheckpointConfig` to policy and llama_forward

* feat: move `distribute_layer` and `get_stage_index` to PipelineStageManager

* fix: add optional args for `distribute_layer` and `get_stage_index`

* fix: fix changed API calls

* test: update llama tests

* style: polish `GradientCheckpointConfig`

* fix: fix pipeline utils tests
2024-04-01 11:34:58 +08:00
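
As a rough sketch of how a `gradient_checkpointing_ratio` could translate into a per-stage count of checkpointed layers: the actual `GradientCheckpointConfig`/`PipelineGradientCheckpointConfig` added in the commit above may expose different fields and logic, so treat this as a simplified stand-in.

```python
# Hypothetical, simplified gradient-checkpointing config (not the real
# ColossalAI class): decide how many layers of each pipeline stage to
# checkpoint from a global ratio.
from dataclasses import dataclass
import math

@dataclass
class SimpleGradientCheckpointConfig:
    gradient_checkpointing_ratio: float = 1.0  # fraction of layers to checkpoint

    def num_ckpt_layers(self, num_layers_in_stage: int) -> int:
        # checkpoint the first ceil(ratio * n) layers of this stage
        return math.ceil(self.gradient_checkpointing_ratio * num_layers_in_stage)

if __name__ == "__main__":
    cfg = SimpleGradientCheckpointConfig(gradient_checkpointing_ratio=0.5)
    # e.g. a 32-layer llama split over 4 pipeline stages of 8 layers each
    for stage, n_layers in enumerate([8, 8, 8, 8]):
        print(f"stage {stage}: checkpoint {cfg.num_ckpt_layers(n_layers)} / {n_layers} layers")
```
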
YeAnbang
df5e9c53cf
[ColossalChat] Update RLHF V2 (#5286)
* Add dpo. Fix sft, ppo, lora. Refactor all

* fix and tested ppo

* 2 nd round refactor

* add ci tests

* fix ci

* fix ci

* fix readme, style

* fix readme style

* fix style, fix benchmark

* reproduce benchmark result, remove useless files

* rename to ColossalChat

* use new image

* fix ci workflow

* fix ci

* use local model/tokenizer for ci tests

* fix ci

* fix ci

* fix ci

* fix ci timeout

* fix rm progress bar. fix ci timeout

* fix ci

* fix ci typo

* remove 3d plugin from ci temporary

* test environment

* cannot save optimizer

* support chat template

* fix readme

* fix path

* test ci locally

* restore build_or_pr

* fix ci data path

* fix benchmark

* fix ci, move ci tests to 3080, disable fast tokenizer

* move ci to 85

* support flash attention 2

* add all-in-one data preparation script. Fix colossal-llama2-chat chat template

* add hardware requirements

* move ci test data

* fix save_model, add unwrap

* fix missing bos

* fix missing bos; support grad accumulation with gemini

* fix ci

* fix ci

* fix ci

* fix llama2 chat template config

* debug sft

* debug sft

* fix colossalai version requirement

* fix ci

* add sanity check to prevent NaN loss

* fix requirements

* add dummy data generation script

* add dummy data generation script

* add dummy data generation script

* add dummy data generation script

* update readme

* update readme

* update readme and ignore

* fix logger bug

* support parallel_output

* modify data preparation logic

* fix tokenization

* update lr

* fix inference

* run pre-commit

---------

Co-authored-by: Tong Li <tong.li352711588@gmail.com>
2024-03-29 14:12:29 +08:00
Yuanheng Zhao
36c4bb2893
[Fix] Grok-1 use tokenizer from the same pretrained path (#5532)
* [fix] use tokenizer from the same pretrained path

* trust remote code
2024-03-28 16:30:04 +08:00
yuehuayingxueluo
934e31afb2
The writing style of tail processing and the logic related to macro definitions have been optimized. (#5519) 2024-03-28 10:42:51 +08:00
Insu Jang
00525f7772
[shardformer] fix pipeline forward error if custom layer distribution is used (#5189)
* Use self.[distribute_layers|get_stage_index] to exploit custom layer distribution

* Change static methods for t5 layer distribution to member functions

* Change static methods for whisper layer distribution to member functions

* Replace whisper policy usage with self one

* Fix test case to use non-static layer distribution methods

* fix: fix typo

---------

Co-authored-by: Wenhao Chen <cwher@outlook.com>
2024-03-27 13:57:00 +08:00
github-actions[bot]
e6707a6e8d
[format] applied code formatting on changed files in pull request 5510 (#5517)
Co-authored-by: github-actions <github-actions@github.com>
2024-03-27 11:21:03 +08:00
Hongxin Liu
19e1a5cf16
[shardformer] update colo attention to support custom mask (#5510)
* [feature] refactor colo attention (#5462)

* [extension] update api

* [feature] add colo attention

* [feature] update sdpa

* [feature] update npu attention

* [feature] update flash-attn

* [test] add flash attn test

* [test] update flash attn test

* [shardformer] update modeling to fit colo attention (#5465)

* [misc] refactor folder structure

* [shardformer] update llama flash-attn

* [shardformer] fix llama policy

* [devops] update tensornvme install

* [test] update llama test

* [shardformer] update colo attn kernel dispatch

* [shardformer] update blip2

* [shardformer] update chatglm

* [shardformer] update gpt2

* [shardformer] update gptj

* [shardformer] update opt

* [shardformer] update vit

* [shardformer] update colo attention mask prep

* [shardformer] update whisper

* [test] fix shardformer tests (#5514)

* [test] fix shardformer tests

* [test] fix shardformer tests
2024-03-27 11:19:32 +08:00
Edenzzzz
9a3321e9f4
Merge pull request #5515 from Edenzzzz/fix_layout_convert
Fix layout convertor caching
2024-03-26 19:51:02 +08:00
Edenzzzz
18edcd5368 Empty-Commit 2024-03-26 19:50:41 +08:00
Edenzzzz
61da3fbc52 fixed layout converter caching and updated tester 2024-03-26 17:22:27 +08:00
傅剑寒
e6496dd371
[Inference] Optimize request handler of llama (#5512)
* optimize request_handler

* fix ways of writing
2024-03-26 16:37:14 +08:00
Rocky Duan
cbe34c557c
Fix ColoTensorSpec for py11 (#5440) 2024-03-26 15:56:49 +08:00
Hongxin Liu
a7790a92e8
[devops] fix example test ci (#5504) 2024-03-26 15:09:05 +08:00
Yuanheng Zhao
131f32a076
[fix] fix grok-1 example typo (#5506) 2024-03-26 10:19:42 +08:00
flybird11111
0688d92e2d
[shardformer]Fix lm parallel. (#5480)
* fix

* padding vocab_size when using pipeline parallelism

padding vocab_size when using pipeline parallelism

fix

fix

* fix

* fix

fix

fix

* fix gather output

* fix

* fix

* fix

fix resize embedding

fix resize embedding

* fix resize embedding

fix

* revert

* revert

* revert

* fix lm forward distribution

* fix

* test ci

* fix
2024-03-25 17:21:51 +08:00
Runyu Lu
6251d68dc9
[fix] PR #5354 (#5501)
* [fix]

* [fix]

* Update config.py docstring

* [fix] docstring align

* [fix] docstring align

* [fix] docstring align
2024-03-25 15:24:17 +08:00
Runyu Lu
1d626233ce
Merge pull request #5434 from LRY89757/colossal-infer-cuda-graph
[feat] cuda graph support and refactor non-functional api
2024-03-25 14:55:59 +08:00
Runyu Lu
68e9396bc0 [fix] merge conflicts 2024-03-25 14:48:28 +08:00
binmakeswell
34e909256c
[release] grok-1 inference benchmark (#5500)
* [release] grok-1 inference benchmark

* [release] grok-1 inference benchmark

* [release] grok-1 inference benchmark

* [release] grok-1 inference benchmark

* [release] grok-1 inference benchmark
2024-03-25 14:42:51 +08:00
yuehuayingxueluo
87079cffe8
[Inference]Support FP16/BF16 Flash Attention 2 And Add high_precision Flag To Rotary Embedding (#5461)
* Support FP16/BF16 Flash Attention 2

* fix bugs in test_kv_cache_memcpy.py

* add context_kv_cache_memcpy_kernel.cu

* rm typename MT

* add tail process

* add high_precision

* add high_precision to config.py

* rm unused code

* change the comment for the high_precision parameter

* update test_rotary_embdding_unpad.py

* fix vector_copy_utils.h

* add comment for self.high_precision when using float32
2024-03-25 13:40:34 +08:00
Wenhao Chen
bb0a668fee
[hotfix] set return_outputs=False in examples and polish code (#5404)
* fix: simplify merge_batch

* fix: use return_outputs=False to eliminate extra memory consumption

* feat: add return_outputs warning

* style: remove `return_outputs=False` as it is the default value
2024-03-25 12:31:09 +08:00
Runyu Lu
ff4998c6f3 [fix] remove unused comment 2024-03-25 12:00:57 +08:00
Runyu Lu
9fe61b4475 [fix] 2024-03-25 11:37:58 +08:00
Yuanheng Zhao
5fcd7795cd
[example] update Grok-1 inference (#5495)
* revise grok-1 example

* remove unused arg in scripts

* prevent re-installing torch

* update readme

* revert modifying colossalai requirements

* add perf

* trivial

* add tokenizer url
2024-03-24 20:24:11 +08:00
binmakeswell
6df844b8c4
[release] grok-1 314b inference (#5490)
* [release] grok-1 inference

* [release] grok-1 inference

* [release] grok-1 inference
2024-03-22 15:48:12 +08:00
Hongxin Liu
848a574c26
[example] add grok-1 inference (#5485)
* [misc] add submodule

* remove submodule

* [example] support grok-1 tp inference

* [example] add grok-1 inference script

* [example] refactor code

* [example] add grok-1 readme

* [example] add test ci

* [example] update readme
2024-03-21 18:07:22 +08:00
Runyu Lu
5b017d6324 [fix] 2024-03-21 15:55:25 +08:00
Runyu Lu
606603bb88 Merge branch 'feature/colossal-infer' of https://github.com/hpcaitech/ColossalAI into colossal-infer-cuda-graph 2024-03-21 14:25:22 +08:00
Runyu Lu
4eafe0c814 [fix] unused option 2024-03-21 11:28:42 +08:00
binmakeswell
d158fc0e64
[doc] update open-sora demo (#5479)
* [doc] update open-sora demo

* [doc] update open-sora demo

* [doc] update open-sora demo
2024-03-20 16:08:41 +08:00
傅剑寒
7ff42cc06d
add vec_type_trait implementation (#5473) 2024-03-19 18:36:40 +08:00
傅剑寒
b96557b5e1
Merge pull request #5469 from Courtesy-Xs/add_vec_traits
Refactor vector utils
2024-03-19 13:53:26 +08:00
Runyu Lu
aabc9fb6aa [feat] add use_cuda_kernel option 2024-03-19 13:24:25 +08:00
xs_courtesy
48c4f29b27 refactor vector utils 2024-03-19 11:32:01 +08:00
binmakeswell
bd998ced03
[doc] release Open-Sora 1.0 with model weights (#5468)
* [doc] release Open-Sora 1.0 with model weights

* [doc] release Open-Sora 1.0 with model weights

* [doc] release Open-Sora 1.0 with model weights
2024-03-18 18:31:18 +08:00
flybird11111
5e16bf7980
[shardformer] fix gathering output when using tensor parallelism (#5431)
* fix

* padding vocab_size when using pipeline parallelism

padding vocab_size when using pipeline parallelism

fix

fix

* fix

* fix

fix

fix

* fix gather output

* fix

* fix

* fix

fix resize embedding

fix resize embedding

* fix resize embedding

fix

* revert

* revert

* revert
2024-03-18 15:55:11 +08:00
傅剑寒
b6e9785885
Merge pull request #5457 from Courtesy-Xs/ly_add_implementation_for_launch_config
add implementation for GetGPULaunchConfig1D
2024-03-15 11:23:44 +08:00
xs_courtesy
5724b9e31e add some comments 2024-03-15 11:18:57 +08:00
Runyu Lu
6e30248683 [fix] tmp for test 2024-03-14 16:13:00 +08:00
xs_courtesy
388e043930 add implementation for GetGPULaunchConfig1D 2024-03-14 11:13:40 +08:00
Runyu Lu
d02e257abd
Merge branch 'feature/colossal-infer' into colossal-infer-cuda-graph 2024-03-14 10:37:05 +08:00
Runyu Lu
ae24b4f025 diverse tests 2024-03-14 10:35:08 +08:00
Runyu Lu
1821a6dab0 [fix] pytest and fix dyn grid bug 2024-03-13 17:28:32 +08:00
yuehuayingxueluo
f366a5ea1f
[Inference/kernel]Add Fused Rotary Embedding and KVCache Memcopy CUDA Kernel (#5418)
* add rotary embedding kernel

* add rotary_embedding_kernel

* add fused rotary_emb and kvcache memcopy

* add fused_rotary_emb_and_cache_kernel.cu

* add fused_rotary_emb_and_memcopy

* fix bugs in fused_rotary_emb_and_cache_kernel.cu

* fix ci bugs

* use vec memcopy and optimize the global memory access

* fix code style

* fix test_rotary_embdding_unpad.py

* codes revised based on the review comments

* fix bugs about include path

* rm inline
2024-03-13 17:20:03 +08:00
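
The fused CUDA kernel in the commit above combines rotary embedding and the KV-cache memcopy into a single launch. The following is a plain-PyTorch reference of the two unfused steps it replaces; the tensor layouts and helper names are illustrative assumptions, not the kernel's actual interface.

```python
# Unfused reference: (1) rotary embedding on Q/K, (2) copy the newest K of
# each sequence into a blocked (paged) KV cache. Assumed layouts:
#   q/k:      [num_tokens, heads, head_dim]
#   k_cache:  [num_blocks, kv_heads, block_size, head_dim]
import torch

def apply_rotary(x, cos, sin):
    # rotate the two halves of head_dim as (x1, x2) -> (x1*cos - x2*sin, x2*cos + x1*sin)
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat([x1 * cos - x2 * sin, x2 * cos + x1 * sin], dim=-1)

def copy_to_blocked_cache(k_new, k_cache, block_tables, seq_lens, block_size):
    # k_new: [bsz, kv_heads, head_dim] key of the current decoding step
    for i, seq_len in enumerate(seq_lens.tolist()):
        pos = seq_len - 1                              # slot of the newest token
        block = block_tables[i, pos // block_size]     # logical -> physical block
        k_cache[block, :, pos % block_size, :] = k_new[i]

if __name__ == "__main__":
    tokens, heads, dim = 4, 8, 64
    q = torch.randn(tokens, heads, dim)
    cos = torch.randn(tokens, 1, dim // 2)
    sin = torch.randn(tokens, 1, dim // 2)
    print(apply_rotary(q, cos, sin).shape)             # torch.Size([4, 8, 64])

    bsz, kv_heads, block_size, num_blocks = 2, 8, 16, 4
    k_new = torch.randn(bsz, kv_heads, dim)
    k_cache = torch.zeros(num_blocks, kv_heads, block_size, dim)
    block_tables = torch.tensor([[0, 1], [2, 3]])
    seq_lens = torch.tensor([5, 20])
    copy_to_blocked_cache(k_new, k_cache, block_tables, seq_lens, block_size)
```
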
Steve Luo
ed431de4e4
fix rmsnorm template function invocation problem (template function partial specialization is not allowed in C++) and pass e2e precision test (#5454) 2024-03-13 16:00:55 +08:00
Hongxin Liu
f2e8b9ef9f
[devops] fix compatibility (#5444)
* [devops] fix compatibility

* [hotfix] update compatibility test on pr

* [devops] fix compatibility

* [devops] record duration during comp test

* [test] decrease test duration

* fix falcon
2024-03-13 15:24:13 +08:00
傅剑寒
6fd355a5a6
Merge pull request #5452 from Courtesy-Xs/fix_include_path
fix include path
2024-03-13 11:26:41 +08:00
xs_courtesy
c1c45e9d8e fix include path 2024-03-13 11:21:06 +08:00
Steve Luo
b699f54007
optimize rmsnorm: add vectorized elementwise op, feat loop unrolling (#5441) 2024-03-12 17:48:02 +08:00
傅剑寒
368a2aa543
Merge pull request #5445 from Courtesy-Xs/refactor_infer_compilation
Refactor colossal-infer code arch
2024-03-12 14:14:37 +08:00
digger yu
385e85afd4
[hotfix] fix typo s/keywrods/keywords etc. (#5429) 2024-03-12 11:25:16 +08:00
xs_courtesy
095c070a6e refactor code 2024-03-11 17:06:57 +08:00
Camille Zhong
da885ed540
fix tensor data update for gemini loss calculation (#5442) 2024-03-11 13:49:58 +08:00
傅剑寒
21e1e3645c
Merge pull request #5435 from Courtesy-Xs/add_gpu_launch_config
Add query and other components
2024-03-11 11:15:29 +08:00
Runyu Lu
633e95b301 [doc] add doc 2024-03-11 10:56:51 +08:00
Runyu Lu
9dec66fad6 [fix] multi graphs capture error 2024-03-11 10:51:16 +08:00
Runyu Lu
b2c0d9ff2b [fix] multi graphs capture error 2024-03-11 10:49:31 +08:00
Steve Luo
f7aecc0c6b
feat rmsnorm cuda kernel and add unittest, benchmark script (#5417) 2024-03-08 16:21:12 +08:00
xs_courtesy
5eb5ff1464 refactor code 2024-03-08 15:41:14 +08:00
xs_courtesy
01d289d8e5 Merge branch 'feature/colossal-infer' of https://github.com/hpcaitech/ColossalAI into add_gpu_launch_config 2024-03-08 15:04:55 +08:00
xs_courtesy
a46598ac59 add reusable utils for cuda 2024-03-08 14:53:29 +08:00
傅剑寒
2b28b54ac6
Merge pull request #5433 from Courtesy-Xs/add_silu_and_mul
[Inference] Add silu_and_mul for infer
2024-03-08 14:44:37 +08:00
Runyu Lu
cefaeb5fdd [feat] cuda graph support and refactor non-functional api 2024-03-08 14:19:35 +08:00
Hongxin Liu
8020f42630
[release] update version (#5411) 2024-03-07 23:36:07 +08:00
xs_courtesy
95c21498d4 add silu_and_mul for infer 2024-03-07 16:57:49 +08:00
Camille Zhong
743e7fad2f
[colossal-llama2] add stream chat example for chat version model (#5428)
* add stream chat for chat version

* remove os.system clear

* modify function name
2024-03-07 14:58:56 +08:00
Youngon
68f55a709c
[hotfix] fix stable diffusion inference bug. (#5289)
* Update train_ddp.yaml

delete "strategy" to fix DDP config loading bug in "main.py"

* Update train_ddp.yaml

fix inference with scripts/txt2img.py config file load bug.

* Update README.md

add pretrain model test code.
2024-03-05 22:03:40 +08:00
hugo-syn
c8003d463b
[doc] Fix typo s/infered/inferred/ (#5288)
Signed-off-by: hugo-syn <hugo.vincent@synacktiv.com>
2024-03-05 22:02:08 +08:00
digger yu
5e1c93d732
[hotfix] fix typo change MoECheckpintIO to MoECheckpointIO (#5335)
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
2024-03-05 21:52:30 +08:00
Dongruixuan Li
a7ae2b5b4c
[eval-hotfix] set few_shot_data to None when few shot is disabled (#5422) 2024-03-05 21:48:55 +08:00
digger yu
049121d19d
[hotfix] fix typo change enabel to enable under colossalai/shardformer/ (#5317) 2024-03-05 21:48:46 +08:00
digger yu
16c96d4d8c
[hotfix] fix typo change _descrption to _description (#5331) 2024-03-05 21:47:48 +08:00
digger yu
70cce5cbed
[doc] update some translations with README-zh-Hans.md (#5382) 2024-03-05 21:45:55 +08:00
Luo Yihang
e239cf9060
[hotfix] fix typo of openmoe model source (#5403) 2024-03-05 21:44:38 +08:00
MickeyCHAN
e304e4db35
[hotfix] fix sd vit import error (#5420)
* fix import error

* Update dpt_depth.py

---------

Co-authored-by: binmakeswell <binmakeswell@gmail.com>
2024-03-05 21:41:23 +08:00
Hongxin Liu
070df689e6
[devops] fix extension building (#5427) 2024-03-05 15:35:54 +08:00
binmakeswell
822241a99c
[doc] sora release (#5425)
* [doc] sora release

* [doc] sora release

* [doc] sora release

* [doc] sora release
2024-03-05 12:08:58 +08:00
flybird11111
29695cf70c
[example]add gpt2 benchmark example script. (#5295)
* benchmark gpt2

* fix

fix

fix

fix

* [doc] fix typo in Colossal-LLaMA-2/README.md (#5247)

* [workflow] fixed build CI (#5240)

* [workflow] fixed build CI

* polish

* polish

* polish

* polish

* polish

* [ci] fixed booster test (#5251)

* [ci] fixed booster test

* [ci] fixed booster test

* [ci] fixed booster test

* [ci] fixed ddp test (#5254)

* [ci] fixed ddp test

* polish

* fix typo in  applications/ColossalEval/README.md (#5250)

* [ci] fix shardformer tests. (#5255)

* fix ci

fix

* revert: revert p2p

* feat: add enable_metadata_cache option

* revert: enable t5 tests

---------

Co-authored-by: Wenhao Chen <cwher@outlook.com>

* [doc] fix doc typo (#5256)

* [doc] fix annotation display

* [doc] fix llama2 doc

* [hotfix]: add pp sanity check and fix mbs arg (#5268)

* fix: fix misleading mbs arg

* feat: add pp sanity check

* fix: fix 1f1b sanity check

* [workflow] fixed incomplete bash command (#5272)

* [workflow] fixed oom tests (#5275)

* [workflow] fixed oom tests

* polish

* polish

* polish

* [ci] fix test_hybrid_parallel_plugin_checkpoint_io.py (#5276)

* fix ci

fix

* fix test

* revert: revert p2p

* feat: add enable_metadata_cache option

* revert: enable t5 tests

* fix

---------

Co-authored-by: Wenhao Chen <cwher@outlook.com>

* [shardformer] hybridparallelplugin support gradients accumulation. (#5246)

* support gradients acc

fix

fix

fix

fix

fix

fix

fix

fix

fix

fix

fix

fix

fix

* fix

fix

* fix

fix

fix

* [hotfix] Fix ShardFormer test execution path when using sequence parallelism (#5230)

* fix auto loading gpt2 tokenizer (#5279)

* [doc] add llama2-13B display (#5285)

* Update README.md

* fix 13b typo

---------

Co-authored-by: binmakeswell <binmakeswell@gmail.com>

* fix llama pretrain (#5287)

* fix

* fix

* fix

fix

* fix

fix

fix

* fix

fix

* benchmark gpt2

* fix

fix

fix

fix

* [workflow] fixed build CI (#5240)

* [workflow] fixed build CI

* polish

* polish

* polish

* polish

* polish

* [ci] fixed booster test (#5251)

* [ci] fixed booster test

* [ci] fixed booster test

* [ci] fixed booster test

* fix

fix

* fix

fix

fix

* fix

* fix

fix

fix

fix

fix

* fix

* Update shardformer.py

---------

Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: Frank Lee <somerlee.9@gmail.com>
Co-authored-by: Wenhao Chen <cwher@outlook.com>
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: Michelle <97082656+MichelleMa8@users.noreply.github.com>
Co-authored-by: Desperado-Jia <502205863@qq.com>
2024-03-04 16:18:13 +08:00
Frank Lee
593a72e4d5
Merge pull request #5424 from FrankLeeeee/sync/main
Sync/main
2024-03-04 10:13:59 +08:00
FrankLeeeee
0310b76e9d Merge branch 'main' into sync/main 2024-03-04 10:09:36 +08:00
Camille Zhong
4b8312c08e
fix sft single turn inference example (#5416) 2024-03-01 17:27:50 +08:00
binmakeswell
a1c6cdb189 [doc] fix blog link 2024-02-29 15:01:43 +08:00
binmakeswell
5de940de32 [doc] fix blog link 2024-02-29 15:01:43 +08:00
Frank Lee
2461f37886
[workflow] added pypi channel (#5412) 2024-02-29 13:56:55 +08:00
Tong Li
a28c971516
update requirements (#5407) 2024-02-28 17:46:27 +08:00
yuehuayingxueluo
0aa27f1961
[Inference]Move benchmark-related code to the example directory. (#5408)
* move benchmark-related code to the example directory.

* fix bugs in test_fused_rotary_embedding.py
2024-02-28 16:46:03 +08:00
yuehuayingxueluo
600881a8ea
[Inference]Add CUDA KVCache Kernel (#5406)
* add cuda KVCache kernel

* annotation benchmark_kvcache_copy

* add use cuda

* fix import path

* move benchmark scripts to example/

* rm benchmark codes in test_kv_cache_memcpy.py

* rm redundancy codes

* rm redundancy codes

* pr was modified according to the review
2024-02-28 14:36:50 +08:00
flybird11111
0a25e16e46
[shardformer]gather llama logits (#5398)
* gather llama logits

* fix
2024-02-27 22:44:07 +08:00
Frank Lee
dcdd8a5ef7
[setup] fixed nightly release (#5388) 2024-02-27 15:19:13 +08:00
QinLuo
bf34c6fef6
[fsdp] impl save/load shard model/optimizer (#5357) 2024-02-27 13:51:14 +08:00
Hongxin Liu
d882d18c65
[example] reuse flash attn patch (#5400) 2024-02-27 11:22:07 +08:00
Hongxin Liu
95c21e3950
[extension] hotfix jit extension setup (#5402) 2024-02-26 19:46:58 +08:00
Yuanheng Zhao
19061188c3
[Infer/Fix] Fix Dependency in test - RMSNorm kernel (#5399)
fix dependency in pytest
2024-02-26 16:17:47 +08:00
yuehuayingxueluo
bc1da87366
[Fix/Inference] Fix format of input prompts and input model in inference engine (#5395)
* Fix bugs in inference_engine

* fix bugs in engine.py

* rm  CUDA_VISIBLE_DEVICES

* add request_ids in generate

* fix bug in engine.py

* add logger.debug for BatchBucket
2024-02-23 10:51:35 +08:00
yuehuayingxueluo
2a718c8be8
Optimized the execution interval time between cuda kernels caused by view and memcopy (#5390)
* opt_view_and_memcopy

* fix bugs in ci

* fix ci bugs

* update benchmark scripts

* fix ci bugs
2024-02-21 13:23:57 +08:00
Jianghai
730103819d
[Inference]Fused kv copy into rotary calculation (#5383)
* revise rotary embedding

* remove useless print

* adapt

* fix

* add

* fix

* modeling

* fix

* fix

* fix

* fused kv copy

* fused copy

* colossalai/kernel/triton/no_pad_rotary_embedding.py

* del padding llama

* del
2024-02-21 11:31:48 +08:00
Stephan Kölker
5d380a1a21
[hotfix] Fix wrong import in meta_registry (#5392) 2024-02-20 19:24:43 +08:00
CZYCW
b833153fd5
[hotfix] fix variable type for top_p (#5313)
Co-authored-by: binmakeswell <binmakeswell@gmail.com>
2024-02-19 18:25:44 +08:00
Yuanheng Zhao
b21aac5bae
[Inference] Optimize and Refactor Inference Batching/Scheduling (#5367)
* add kvcache manager funcs for batching

* add batch bucket for batching

* revise RunningList struct in handler

* add kvcache/batch funcs for compatibility

* use new batching methods

* fix indexing bugs

* revise abort logic

* use cpu seq lengths/block tables

* rm unused attr in Sequence

* fix type conversion/default arg

* add and revise pytests

* revise pytests, rm unused tests

* rm unused statements

* fix pop finished indexing issue

* fix: use index in batch when retrieving inputs/update seqs

* use dict instead of odict in batch struct

* arg type hinting

* fix make compress

* refine comments

* fix: pop_n_seqs to pop the first n seqs

* add check in request handler

* remove redundant conversion

* fix test for request handler

* fix pop method in batch bucket

* fix prefill adding
2024-02-19 17:18:20 +08:00
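
A toy sketch of the batch-bucket idea from the batching/scheduling commit above: fixed slots, CPU-side sequence lengths and block tables, and cheap add/pop of sequences without rebuilding the whole batch. The real `BatchBucket` in colossalai differs in interface and detail; names and fields here are assumptions.

```python
# Illustrative continuous-batching bucket (not the actual colossalai class).
import torch

class ToyBatchBucket:
    def __init__(self, max_batch_size: int, max_blocks_per_seq: int):
        self.seq_lens = torch.zeros(max_batch_size, dtype=torch.int64)
        self.block_tables = torch.full((max_batch_size, max_blocks_per_seq), -1, dtype=torch.int64)
        self.occupied = [False] * max_batch_size

    def add_seq(self, prompt_len: int, blocks: list) -> int:
        slot = self.occupied.index(False)               # first free slot
        self.occupied[slot] = True
        self.seq_lens[slot] = prompt_len
        self.block_tables[slot, :len(blocks)] = torch.tensor(blocks)
        return slot

    def step(self) -> None:
        # after one decoding step, every running sequence grows by one token
        for slot, busy in enumerate(self.occupied):
            if busy:
                self.seq_lens[slot] += 1

    def pop_finished(self, eos_slots: list) -> None:
        # free finished slots so new requests can be merged in next step
        for slot in eos_slots:
            self.occupied[slot] = False
            self.seq_lens[slot] = 0
            self.block_tables[slot].fill_(-1)

if __name__ == "__main__":
    bucket = ToyBatchBucket(max_batch_size=4, max_blocks_per_seq=8)
    a = bucket.add_seq(prompt_len=5, blocks=[0])
    b = bucket.add_seq(prompt_len=9, blocks=[1, 2])
    bucket.step()
    bucket.pop_finished([a])                            # slot a is immediately reusable
    print(bucket.seq_lens.tolist(), bucket.occupied)
```
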
Frank Lee
705a62a565
[doc] updated installation command (#5389) 2024-02-19 16:54:03 +08:00
yixiaoer
69e3ad01ed
[doc] Fix typo (#5361) 2024-02-19 16:53:28 +08:00
Hongxin Liu
7303801854
[llama] fix training and inference scripts (#5384)
* [llama] refactor inference example to fit sft

* [llama] fix training script to fit gemini

* [llama] fix inference script
2024-02-19 16:41:04 +08:00
Hongxin Liu
adae123df3
[release] update version (#5380) 2024-02-08 18:50:09 +08:00
Frank Lee
efef43b53c
Merge pull request #5372 from hpcaitech/exp/mixtral 2024-02-08 16:30:05 +08:00
yuehuayingxueluo
8c69debdc7
[Inference]Support vllm testing in benchmark scripts (#5379)
* add vllm benchmark scripts

* fix code style

* update run_benchmark.sh

* fix code style
2024-02-08 15:27:26 +08:00
Frank Lee
4c03347fc7
Merge pull request #5377 from hpcaitech/example/llama-npu
[llama] support npu for Colossal-LLaMA-2
2024-02-08 14:12:11 +08:00
Frank Lee
9afa52061f
[inference] refactored config (#5376) 2024-02-08 14:04:14 +08:00
ver217
06db94fbc9 [moe] fix tests 2024-02-08 12:46:37 +08:00
Hongxin Liu
65e5d6baa5 [moe] fix mixtral optim checkpoint (#5344) 2024-02-07 19:21:02 +08:00
Hongxin Liu
956b561b54 [moe] fix mixtral forward default value (#5329) 2024-02-07 19:21:02 +08:00
Hongxin Liu
b60be18dcc [moe] fix mixtral checkpoint io (#5314) 2024-02-07 19:21:02 +08:00
Hongxin Liu
da39d21b71 [moe] support mixtral (#5309)
* [moe] add mixtral block for single expert

* [moe] mixtral block fwd support uneven ep

* [moe] mixtral block bwd support uneven ep

* [moe] add mixtral moe layer

* [moe] simplify replace

* [moe] support save sharded mixtral

* [moe] support load sharded mixtral

* [moe] support save sharded optim

* [moe] integrate moe manager into plugin

* [moe] fix optimizer load

* [moe] fix mixtral layer
2024-02-07 19:21:02 +08:00
Hongxin Liu
c904d2ae99 [moe] update capacity computing (#5253)
* [moe] top2 allow uneven input

* [moe] update capacity computing

* [moe] remove debug info

* [moe] update capacity computing

* [moe] update capacity computing
2024-02-07 19:21:02 +08:00
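
For reference, a common way to compute top-2 expert capacity, which is the kind of logic the capacity-computing commit above revises; the exact formula and defaults used in ColossalAI may differ, so this is only a hedged sketch.

```python
# Typical top-k expert capacity computation for MoE routing (illustrative).
import math

def expert_capacity(num_tokens: int, num_experts: int,
                    capacity_factor: float = 1.25, top_k: int = 2,
                    min_capacity: int = 4) -> int:
    """Tokens each expert may accept; overflowing tokens are dropped or rerouted."""
    cap = math.ceil(top_k * num_tokens * capacity_factor / num_experts)
    return max(cap, min_capacity)

if __name__ == "__main__":
    # e.g. a batch of 4096 tokens routed over 8 experts with top-2 gating
    print(expert_capacity(num_tokens=4096, num_experts=8))  # -> 1280
```
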
Xuanlei Zhao
7d8e0338a4 [moe] init mixtral impl 2024-02-07 19:21:02 +08:00
Jianghai
1f8c7e7046
[Inference] User Experience: update the logic of default tokenizer and generation config. (#5337)
* add

* fix

* fix

* pause

* fix

* fix pytest

* align

* fix

* license

* fix

* fix

* fix readme

* fix some bugs

* remove tokenizer config
2024-02-07 17:55:48 +08:00
yuehuayingxueluo
6fb4bcbb24
[Inference/opt] Fused KVCache Memcopy (#5374)
* fused kv memcopy

* add TODO in test_kvcache_copy.py
2024-02-07 17:15:42 +08:00
Frank Lee
58740b5f68
[inference] added inference template (#5375) 2024-02-07 17:11:43 +08:00
Frank Lee
8106ede07f
Revert "[Inference] Adapt to Fused rotary (#5348)" (#5373)
This reverts commit 9f4ab2eb92.
2024-02-07 14:27:04 +08:00
Jianghai
9f4ab2eb92
[Inference] Adapt to Fused rotary (#5348)
* revise rotary embedding

* remove useless print

* adapt

* fix

* add

* fix

* modeling

* fix

* fix

* fix
2024-02-07 11:36:04 +08:00
yuehuayingxueluo
35382a7fbf
[Inference]Fused the gate and up proj in mlp, and optimized the autograd process. (#5365)
* fused the gate and up proj in mlp

* fix code styles

* opt auto_grad

* rollback test_inference_engine.py

* modifications based on the review feedback.

* fix bugs in flash attn

* Change reshape to view

* fix test_rmsnorm_triton.py
2024-02-06 19:38:25 +08:00
Hongxin Liu
084c91246c
[llama] fix memory issue (#5371)
* [llama] fix memory issue

* [llama] add comment
2024-02-06 19:02:37 +08:00
Yuanheng Zhao
1dedb57747
[Fix/Infer] Remove unused deps and revise requirements (#5341)
* remove flash-attn dep

* rm padding llama

* revise infer requirements

* move requirements out of module
2024-02-06 17:27:45 +08:00
Hongxin Liu
c53ddda88f
[lr-scheduler] fix load state dict and add test (#5369) 2024-02-06 14:23:32 +08:00
Hongxin Liu
eb4f2d90f9
[llama] polish training script and fix optim ckpt (#5368) 2024-02-06 11:52:17 +08:00
Camille Zhong
a5756a8720
[eval] update llama npu eval (#5366) 2024-02-06 10:53:03 +08:00
Camille Zhong
44ca61a22b
[llama] fix neftune & pbar with start_step (#5364) 2024-02-05 18:04:23 +08:00
Hongxin Liu
a4cec1715b
[llama] add flash attn patch for npu (#5362) 2024-02-05 16:48:34 +08:00
Hongxin Liu
73f9f23fc6
[llama] update training script (#5360)
* [llama] update training script

* [doc] polish docstr
2024-02-05 16:33:18 +08:00
Hongxin Liu
6c0fa7b9a8
[llama] fix dataloader for hybrid parallel (#5358)
* [plugin] refactor prepare dataloader

* [plugin] update train script
2024-02-05 15:14:56 +08:00
Hongxin Liu
2dd01e3a14
[gemini] fix param op hook when output is tuple (#5355)
* [gemini] fix param op hook when output is tuple

* [gemini] fix param op hook
2024-02-04 11:58:26 +08:00
yuehuayingxueluo
631862f339
[Inference]Optimize generation process of inference engine (#5356)
* opt inference engine

* fix run_benchmark.sh

* fix generate in engine.py

* rollback test_inference_engine.py
2024-02-02 15:38:21 +08:00
yuehuayingxueluo
21ad4a27f9
[Inference/opt]Optimize the mid tensor of RMS Norm (#5350)
* opt rms_norm

* fix bugs in rms_layernorm
2024-02-02 15:06:01 +08:00
Wenhao Chen
1c790c0877
[fix] remove unnecessary dp_size assert (#5351)
* fix: remove unnecessary assert

* test: add more 3d plugin tests

* fix: add warning
2024-02-02 14:40:20 +08:00
Frank Lee
027aa1043f
[doc] updated inference readme (#5343) 2024-02-02 14:31:10 +08:00
Frank Lee
e76acbb076
[inference] moved ops tests to test_infer (#5354) 2024-02-02 13:51:22 +08:00
Frank Lee
db1a763307
[inference] removed redundancy init_batch (#5353) 2024-02-02 11:44:15 +08:00
Hongxin Liu
ffffc32dc7
[checkpointio] fix gemini and hybrid parallel optim checkpoint (#5347)
* [checkpointio] fix hybrid parallel optim checkpoint

* [extension] fix cuda extension

* [checkpointio] fix gemini optimizer checkpoint

* polish code
2024-02-01 16:13:06 +08:00
yuehuayingxueluo
249644c23b
[Inference]Replace Attention layer and MLP layer by shardformer to optimize the weight transpose operation, add fused_qkv and fused linear_add (#5340)
* add fused qkv

* replace attn and mlp by shardformer

* fix bugs in mlp

* add docstrings

* fix test_inference_engine.py

* add optimize unbind

* add fused_addmm

* rm squeeze(1)

* refactor codes

* fix ci bugs

* rename ShardFormerLlamaMLP and ShardFormerLlamaAttention

* Removed the dependency on LlamaFlashAttention2

* rollback test_inference_engine.py
2024-02-01 15:49:39 +08:00
Frank Lee
f8e456d202
[inference] simplified config verification (#5346)
* [inference] simplified config verification

* polish

* polish
2024-02-01 15:31:01 +08:00
YeAnbang
c5239840e6
[Chat] fix sft loss nan (#5345)
* fix script

* fix script

* fix chat nan

* fix chat nan
2024-02-01 14:25:16 +08:00
Frank Lee
abd8e77ad8
[extension] fixed exception catch (#5342) 2024-01-31 18:09:49 +08:00
Jianghai
df0aa49585
[Inference] Kernel Fusion, fused copy kv cache into rotary embedding (#5336)
* revise rotary embedding

* remove useless print

* adapt
2024-01-31 16:31:29 +08:00
Frank Lee
1336838a91
Merge pull request #5339 from FrankLeeeee/sync/merge-main
Sync/merge main
2024-01-31 16:29:26 +08:00
FrankLeeeee
c565519913 merge commit 2024-01-31 10:41:47 +08:00
Yuanheng Zhao
5f98a9d68a
[Infer] Optimize Blocked KVCache And Kernels Using It (#5325)
* revise shape of kvcache (context attn kernel)

* revise shape of kvcache (flash decoding kernel)

* revise shape of kvcache (kvcache copy) and attn func

* init of kvcache in kvcache manager

* revise llama modeling

* revise block size retrieval

* use torch for rms_norm benchmarking

* revise block size retrieval
2024-01-30 16:06:09 +08:00
yuehuayingxueluo
e8f0642f28
[Inference]Add Nopadding Llama Modeling (#5327)
* add nopadding llama modeling

* add nopadding_llama.py

* rm unused codes

* fix bugs in test_xine_copy.py

* fix code style
2024-01-30 10:31:46 +08:00
digger yu
71321a07cf
fix typo change dosen't to doesn't (#5308) 2024-01-30 09:57:38 +08:00
digger yu
6a3086a505
fix typo under extensions/ (#5330) 2024-01-30 09:55:16 +08:00
Frank Lee
febed23288
[doc] added docs for extensions (#5324)
* [doc] added docs for extensions

* polish

* polish
2024-01-29 17:39:23 +08:00
flybird11111
388179f966
[tests] fix t5 test. (#5322)
* [ci] fix shardformer tests. (#5255)

* fix ci

fix

* revert: revert p2p

* feat: add enable_metadata_cache option

* revert: enable t5 tests

---------

Co-authored-by: Wenhao Chen <cwher@outlook.com>

* fix t5 test

---------

Co-authored-by: Wenhao Chen <cwher@outlook.com>
2024-01-29 17:38:46 +08:00
Jianghai
c7c104cb7c
[DOC] Update inference readme (#5280)
* add readme

* add readme

* 1

* update engine

* finish readme

* add readme
2024-01-29 16:21:06 +08:00
Frank Lee
a6709afe66
Merge pull request #5321 from FrankLeeeee/hotfix/accelerator-api
[accelerator] fixed npu api
2024-01-29 14:29:58 +08:00
FrankLeeeee
087d0cb1fc [accelerator] fixed npu api 2024-01-29 14:27:52 +08:00
Frank Lee
8823cc4831
Merge pull request #5310 from hpcaitech/feature/npu
Feature/npu
2024-01-29 13:49:39 +08:00
Frank Lee
73f4dc578e
[workflow] updated CI image (#5318) 2024-01-29 11:53:07 +08:00
Jianghai
1f8a75d470
[Inference] Update rms norm kernel, benchmark with vLLM (#5315)
* add

* xi

* del

* del

* fix
2024-01-29 10:22:33 +08:00
Jianghai
7ddd8b37f0
fix (#5311) 2024-01-26 15:02:12 +08:00
yuehuayingxueluo
4f28cb43c0
[inference]Optimize the usage of the mid tensors space in flash attn (#5304)
* opt flash attn

* opt tmp tensor

* fix benchmark_llama

* fix code style

* fix None logic for output tensor

* fix adapted to get_xine_cache

* add comment

* fix ci bugs

* fix some codes

* rm duplicated codes

* rm duplicated codes

* fix code style

* add _get_dtype in config.py
2024-01-26 14:00:10 +08:00
Frank Lee
7cfed5f076
[feat] refactored extension module (#5298)
* [feat] refactored extension module

* polish

* polish

* polish

* polish

* polish

* polish

* polish

* polish

* polish

* polish
2024-01-25 17:01:48 +08:00
digger yu
bce9499ed3
fix some typo (#5307) 2024-01-25 13:56:27 +08:00
李文军
ec912b1ba9
[NFC] polish applications/Colossal-LLaMA-2/colossal_llama2/tokenizer/init_tokenizer.py code style (#5228) 2024-01-25 13:14:48 +08:00
Yuanheng Zhao
af8359c430
[hotfix] fix boundary check in batch (#5306) 2024-01-25 10:23:12 +08:00
Jianghai
c647e00e3c
[Inference]Add fused rotary kernel and get cos cache kernel (#5302)
* add fused rotary and get cos cache func

* staged

* fix bugs

* fix bugs
2024-01-24 16:20:42 +08:00
Yuanheng Zhao
3da9993b0d
[Kernel/Fix] Revise flash attention triton kernel API and add benchmark (#5301)
* fix decoding kernel pytest

* revise and add triton context attn benchmark
2024-01-23 17:16:02 +08:00
Jianghai
8e606ecc7e
[Inference] Benchmarking rotary embedding and add a fetch function (#5277)
* fix bugs and add a cos/sin cache fetch func

* add docstring

* fix bug

* fix
2024-01-23 12:11:53 +08:00
Desperado-Jia
ddf879e2db
fix bug for mixture (#5299) 2024-01-22 22:17:54 +08:00
yuehuayingxueluo
b7853196a0
Merge pull request #5297 from yuehuayingxueluo/fix_rotary_embedding
[Inference/fix]Add utils.py for Rotary Embedding
2024-01-22 17:07:14 +08:00
yuehuayingxueluo
cea9c86e45 add utils.py 2024-01-22 16:06:27 +08:00
Hongxin Liu
d7f8db8e21
[hotfix] fix 3d plugin test (#5292) 2024-01-22 15:19:04 +08:00
yuehuayingxueluo
bfff9254ac
[inference] Adapted to Rotary Embedding and RMS Norm (#5283)
* adapted to rotary_embedding

* adapted to nopad rms norm

* fix bugs in benchmark

* fix flash_decoding.py
2024-01-22 10:55:34 +08:00
flybird11111
f7e3f82a7e
fix llama pretrain (#5287) 2024-01-19 17:49:02 +08:00
Desperado-Jia
6a56967855
[doc] add llama2-13B display (#5285)
* Update README.md

* fix 13b typo

---------

Co-authored-by: binmakeswell <binmakeswell@gmail.com>
2024-01-19 16:04:08 +08:00
Yuanheng Zhao
6e487e7d3c
[kernel/fix] Performance Optimization for Decoding Kernel and Benchmarking (#5274)
* prevent re-creating intermediate tensors

* add singleton class holding intermediate values

* fix triton kernel api

* add benchmark in pytest

* fix kernel api and add benchmark

* revise flash decoding triton kernel in/out shapes

* fix calling of triton kernel in modeling

* fix pytest: extract to util functions
2024-01-19 15:47:16 +08:00
Jianghai
9e2342bde2
[Hotfix] Fix bugs in testing continuous batching (#5270)
* fix bug

* fix bugs

* fix bugs

* fix bugs and add padding

* add funcs and fix bugs

* fix typos

* fix bugs

* add func
2024-01-18 16:31:14 +08:00
Michelle
32cb74493a
fix auto loading gpt2 tokenizer (#5279) 2024-01-18 14:08:29 +08:00
Frank Lee
d66e6988bc
Merge pull request #5278 from ver217/sync/npu
[sync] sync npu branch with main
2024-01-18 13:11:45 +08:00
ver217
148469348a Merge branch 'main' into sync/npu 2024-01-18 12:05:21 +08:00
Yaozheng Fang
5ae9099f92
[kernel] Add RMSLayerNorm triton kernel (#5262)
* add layerrmsnorm triton kernel

* add layerrmsnorm kernel

* modify the atol and rtol in test file

* Remove the logic of mean computations, and update the names of the kernel functions and files

* add benchmark of rms norm
2024-01-18 10:21:03 +08:00
Zhongkai Zhao
5d9a0ae75b
[hotfix] Fix ShardFormer test execution path when using sequence parallelism (#5230) 2024-01-17 17:42:29 +08:00
yuehuayingxueluo
86b63f720c
[Inference]Adapted to the triton attn kernels (#5264)
* adapted to the triton attn kernels

* fix pad input

* adapted to copy_kv_to_blocked_cache

* fix ci test

* update kv memcpy

* remove print
2024-01-17 16:03:10 +08:00
flybird11111
46e091651b
[shardformer] hybridparallelplugin support gradients accumulation. (#5246)
* support gradients acc

fix

fix

fix

fix

fix

fix

fix

fix

fix

fix

fix

fix

fix

* fix

fix

* fix

fix

fix
2024-01-17 15:22:33 +08:00
flybird11111
2a0558d8ec
[ci] fix test_hybrid_parallel_plugin_checkpoint_io.py (#5276)
* fix ci

fix

* fix test

* revert: revert p2p

* feat: add enable_metadata_cache option

* revert: enable t5 tests

* fix

---------

Co-authored-by: Wenhao Chen <cwher@outlook.com>
2024-01-17 13:38:55 +08:00
Frank Lee
d69cd2eb89
[workflow] fixed oom tests (#5275)
* [workflow] fixed oom tests

* polish

* polish

* polish
2024-01-16 18:55:13 +08:00
Yuanheng Zhao
0f2b46a41c
[kernel] Revise KVCache copy triton kernel API (#5273)
* [kernel/fix] revise kvcache copy kernel api

* fix benchmark
2024-01-16 14:41:02 +08:00
Frank Lee
04244aaaf1
[workflow] fixed incomplete bash command (#5272) 2024-01-16 11:54:44 +08:00
Jianghai
d8db500efc
[Inference] Fix request handler and add recycle logic (#5260)
* fix request handler

* fix comment
2024-01-15 17:50:46 +08:00
Frank Lee
c597678da4
[doc] updated inference readme (#5269) 2024-01-15 17:37:41 +08:00
Yuanheng Zhao
fa85e02b3b
[kernel] Add KV cache copy kernel during decoding (#5261)
* add kv copy triton kernel during decoding stage

* add pytest and fix kernel

* fix test utilities

* revise kernel config

* add benchmark for kvcache copy
2024-01-15 17:37:20 +08:00
Wenhao Chen
ef4f0ee854
[hotfix]: add pp sanity check and fix mbs arg (#5268)
* fix: fix misleading mbs arg

* feat: add pp sanity check

* fix: fix 1f1b sanity check
2024-01-15 15:57:40 +08:00
FrankLeeeee
1ded7e81ef [git] fixed rebased files 2024-01-11 13:50:45 +00:00
Yuanheng Zhao
1513f20f4d [kernel] Add flash decoding triton kernel for blocked kv cache (#5249)
* add flash decoding unpad triton kernel

* rename flash decoding kernel

* add kernel testing (draft)

* revise pytest

* support kv group (GQA)

* (trivial) fix api and pytest

* (trivial) func renaming

* (trivial) func/file renaming

* refactor pytest for attention

* (trivial) format and consistent vars of context/decode attn

* (trivial) remove test redundancy
2024-01-11 13:46:14 +00:00
Jianghai
fded91d049 [Inference] Kernel: no pad rotary embedding (#5252)
* fix bugs

* comment

* use more accurate atol

* fix
2024-01-11 13:46:14 +00:00
yuehuayingxueluo
d40eb26029 fix bugs in request_handler.py and engine.py 2024-01-11 13:46:14 +00:00
yuehuayingxueluo
10e3c9f923 rm torch.cuda.synchronize 2024-01-11 13:46:14 +00:00
yuehuayingxueluo
fab294c7f4 fix CI bugs 2024-01-11 13:46:14 +00:00
yuehuayingxueluo
2a73e828eb fix bugs related to processing padding mask 2024-01-11 13:46:14 +00:00
Jianghai
e545a871b8 [Hotfix] Fix accuracy and align attention method api with Triton kernel (#5229)
* fix accuracy

* alignment in attention

* fix attention

* fix

* fix bugs

* fix bugs

* fix bugs
2024-01-11 13:46:14 +00:00
yuehuayingxueluo
fa4fbdbffb adapted to pad_context_forward 2024-01-11 13:44:06 +00:00
yuehuayingxueluo
47e53eaa1c fix bugs in attention.py and request_handler.py 2024-01-11 13:44:06 +00:00
Jianghai
bfd9b1b494 [Inference] Pytorch Attention func, pad&nopad input support (#5219)
* add attn

* add attention test

* fix attn forward

* fix decoding
2024-01-11 13:44:06 +00:00
yuehuayingxueluo
3ad1f3b78b fix beam_width 2024-01-11 13:39:56 +00:00
yuehuayingxueluo
b2eb9cd186 Fixed a typo 2024-01-11 13:39:56 +00:00
yuehuayingxueluo
bbfebfb9fc fix bugs in sampler 2024-01-11 13:39:56 +00:00
yuehuayingxueluo
02c1bf8b2a add context_attention_unpadded 2024-01-11 13:39:56 +00:00
Yuanheng Zhao
07b5283b6a [kernel] Add triton kernel for context attention (FAv2) without padding (#5192)
* add context attn unpadded triton kernel

* test compatibility

* kv cache copy (testing)

* fix k/v cache copy

* fix kv cache copy and test

* fix boundary of block ptrs

* add support for GQA/MQA and testing

* fix import statement

---------

Co-authored-by: Round Heng <yuanhengzhao@Rounds-MacBook-Pro.local>
2024-01-11 13:39:56 +00:00
yuehuayingxueluo
4df8876fca Fixed a writing error 2024-01-11 13:39:56 +00:00
yuehuayingxueluo
9489dc64d8 precision alignment 2024-01-11 13:39:56 +00:00
yuehuayingxueluo
62968588d1 fix bugs in request_handler 2024-01-11 13:39:56 +00:00
yuehuayingxueluo
62fd08ee44 Fixed a bug in the inference frame 2024-01-11 13:39:56 +00:00
yuehuayingxueluo
86853a37d5 Add padding llama model 2024-01-11 13:39:56 +00:00
Jianghai
0e616462a7 [Inference] add logit processor and request handler (#5166)
* add logit processor and request handler

* add

* add

* add

* fix

* add search tokens and update func

* finish request handler

* add running list test

* fix test

* fix some bug

* add

* add

* fix bugs

* fix some bugs

* fix bug

* fix

* fix

* add copy fun

* del useless attn

* fix request status

---------

Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
2024-01-11 13:39:56 +00:00
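A logit processor is just a callable that rewrites the raw logits before sampling (temperature, top-k, top-p, and so on), and the request handler applies a chain of them at every generation step. The hedged sketch below shows two such processors; the actual interface in this PR may differ.

import torch

def temperature_processor(logits, temperature):
    return logits / max(temperature, 1e-5)

def top_k_processor(logits, k):
    kth_value = torch.topk(logits, k, dim=-1).values[..., -1, None]
    return logits.masked_fill(logits < kth_value, float("-inf"))

logits = torch.randn(2, 32000)                      # [batch, vocab]
logits = top_k_processor(temperature_processor(logits, 0.7), 50)
next_tokens = torch.multinomial(torch.softmax(logits, dim=-1), num_samples=1)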
yuehuayingxueluo
8daee26989 [Inference] Add the logic of the inference engine (#5173)
* add infer_struct and infer_config

* update codes

* change InferConfig

* Add hf_model_config to the engine

* rm _get_hf_model_config

* update codes

* made adjustments according to the feedback from the reviewer.

* update codes

* add ci test for config and struct

* Add the logic of the inference engine

* update engine and test

* Recover cache_manager.py

* add logger

* fix conflict

* update codes

* update codes

* update model and tokenizer

* fix add the logic about shardformer

* change kvcache_manager docstring

* add policy

* fix ci bug in test_kvcache_manager.py

* remove code related to tokenizer and move model_policy

* fix code style

* add ordered_set to requirements-infer.txt

* Delete extra empty lines

* add ordered_set to requirements-test.txt
2024-01-11 13:39:56 +00:00
Jianghai
93aeacca34 [Inference]Update inference config and fix test (#5178)
* unify the config setting

* fix test

* fix import

* fix test

* fix

* fix

* add logger

* revise log info

---------

Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
2024-01-11 13:39:29 +00:00
Yuanheng Zhao
3de2e62299 [Inference] Add CacheBlock and KV-Cache Manager (#5156)
* [Inference] Add KVCache Manager

* function refactored

* add test for KVCache Manager

* add attr beam width

* Revise alloc func in CacheManager

* Fix docs and pytests

* add tp slicing for head number

* optimize shapes of tensors used as physical cache

* Apply using InferenceConfig on KVCacheManager

* rm duplicate config file

* Optimize cache allocation: use contiguous cache

* Fix config in pytest (and config)
2024-01-11 13:39:29 +00:00
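The manager's core job is to pre-allocate the physical cache as one contiguous pool of fixed-size blocks, with the KV head count sliced by the tensor-parallel size, and to hand out block indices per sequence. The sketch below is an assumption-laden illustration of that allocation, not the actual KVCacheManager API.

import torch

class BlockedKVCache:
    def __init__(self, num_blocks, block_size, num_kv_heads, head_dim, tp_size=1,
                 dtype=torch.float16, device="cuda"):
        heads_per_rank = num_kv_heads // tp_size            # tp slicing of the head number
        shape = (num_blocks, heads_per_rank, block_size, head_dim)
        # One contiguous physical cache for keys and one for values.
        self.k_cache = torch.zeros(shape, dtype=dtype, device=device)
        self.v_cache = torch.zeros(shape, dtype=dtype, device=device)
        self.free_blocks = list(range(num_blocks))

    def allocate(self, num_needed):
        if num_needed > len(self.free_blocks):
            raise RuntimeError("KV cache is out of free blocks")
        return [self.free_blocks.pop() for _ in range(num_needed)]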
yuehuayingxueluo
fab9b931d9 [Inference]Add BatchInferState, Sequence and InferConfig (#5149)
* add infer_struct and infer_config

* update codes

* change InferConfig

* Add hf_model_config to the engine

* rm _get_hf_model_config

* update codes

* made adjustments according to the feedback from the reviewer.

* update codes

* add ci test for config and struct
2024-01-11 13:39:29 +00:00
Yuanheng Zhao
2bb92243d4 [Inference/NFC] Clean outdated inference tests and deprecated kernels (#5159)
* [inference/nfc] remove outdated inference tests

* remove outdated kernel tests

* remove deprecated triton kernels

* remove imports from deprecated kernels
2024-01-11 13:39:29 +00:00
Jianghai
56e75eeb06 [Inference] Add readme (roadmap) and fulfill request handler (#5147)
* request handler

* add readme

---------

Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
2024-01-11 13:39:29 +00:00
Jianghai
4cf4682e70 [Inference] First PR for rebuild colossal-infer (#5143)
* add engine and scheduler

* add dirs

---------

Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
2024-01-11 13:39:29 +00:00
binmakeswell
c174c4fc5f
[doc] fix doc typo (#5256)
* [doc] fix annotation display

* [doc] fix llama2 doc
2024-01-11 21:01:11 +08:00
flybird11111
e830ef917d
[ci] fix shardformer tests. (#5255)
* fix ci

fix

* revert: revert p2p

* feat: add enable_metadata_cache option

* revert: enable t5 tests

---------

Co-authored-by: Wenhao Chen <cwher@outlook.com>
2024-01-11 19:07:45 +08:00
digger yu
756c400ad2
fix typo in applications/ColossalEval/README.md (#5250) 2024-01-11 17:58:38 +08:00
Frank Lee
2b83418719
[ci] fixed ddp test (#5254)
* [ci] fixed ddp test

* polish
2024-01-11 17:16:32 +08:00
Frank Lee
d5eeeb1416
[ci] fixed booster test (#5251)
* [ci] fixed booster test

* [ci] fixed booster test

* [ci] fixed booster test
2024-01-11 16:04:45 +08:00
Frank Lee
edf94a35c3
[workflow] fixed build CI (#5240)
* [workflow] fixed build CI

* polish

* polish

* polish

* polish

* polish
2024-01-10 22:34:16 +08:00
digger yu
41e52c1c6e
[doc] fix typo in Colossal-LLaMA-2/README.md (#5247) 2024-01-10 19:24:56 +08:00
Frank Lee
9102d655ab
[hotfix] removed unused flag (#5242) 2024-01-09 14:57:07 +08:00
Hongxin Liu
d202cc28c0
[npu] change device to accelerator api (#5239)
* update accelerator

* fix timer

* fix amp

* update

* fix

* update bug

* add error raise

* fix autocast

* fix set device

* remove doc accelerator

* update doc

* update doc

* update doc

* use nullcontext

* update cpu

* update null context

* change time limit for example

* update

* update

* update

* update

* [npu] polish accelerator code

---------

Co-authored-by: Xuanlei Zhao <xuanlei.zhao@gmail.com>
Co-authored-by: zxl <43881818+oahzxl@users.noreply.github.com>
2024-01-09 10:20:05 +08:00
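The point of the accelerator API is to stop hard-coding torch.cuda.* so the same code path can run on NPUs. The snippet below only illustrates the dispatch idea in generic PyTorch; it is not the ColossalAI accelerator interface, and the torch_npu import is an assumption about the Ascend environment.

import torch

def current_device_type():
    # Prefer CUDA, fall back to Ascend NPU if torch_npu is installed, else CPU.
    if torch.cuda.is_available():
        return "cuda"
    try:
        import torch_npu  # noqa: F401
        return "npu"
    except ImportError:
        return "cpu"

device = torch.device(current_device_type())
x = torch.randn(8, 8, device=device)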
Elsa Granger
d565df3821
[pipeline] A more general _communicate in p2p (#5062)
* A more general _communicate

* feat: finish tree_flatten version p2p

* fix: update p2p api calls

---------

Co-authored-by: Wenhao Chen <cwher@outlook.com>
2024-01-08 15:37:27 +08:00
Xuanlei Zhao
dd2c28a323
[npu] use extension for op builder (#5172)
* update extension

* update cpu adam

* update is

* add doc for cpu adam

* update kernel

* update commit

* update flash

* update memory efficient

* update flash attn

* update flash attention loader

* update api

* fix

* update doc

* update example time limit

* reverse change

* fix doc

* remove useless kernel

* fix

* not use warning

* update

* update
2024-01-08 11:39:16 +08:00
binmakeswell
7bc6969ce6
[doc] SwiftInfer release (#5236)
* [doc] SwiftInfer release

* [doc] SwiftInfer release

* [doc] SwiftInfer release

* [doc] SwiftInfer release

* [doc] SwiftInfer release
2024-01-08 09:55:12 +08:00
github-actions[bot]
4fb4a22a72
[format] applied code formatting on changed files in pull request 5234 (#5235)
Co-authored-by: github-actions <github-actions@github.com>
2024-01-07 20:55:34 +08:00
binmakeswell
b9b32b15e6
[doc] add Colossal-LLaMA-2-13B (#5234)
* [doc] add Colossal-LLaMA-2-13B

* [doc] add Colossal-LLaMA-2-13B

* [doc] add Colossal-LLaMA-2-13B
2024-01-07 20:53:12 +08:00
JIMMY ZHAO
ce651270f1
[doc] Make leaderboard format more uniform and good-looking (#5231)
* Make leaderboard format more unified and good-looking

* Update README.md

* Update README.md
2024-01-06 17:12:29 +08:00
Camille Zhong
915b4652f3
[doc] Update README.md of Colossal-LLAMA2 (#5233)
* Update README.md

* Update README.md
2024-01-06 17:06:41 +08:00
Tong Li
d992b55968
[Colossal-LLaMA-2] Release Colossal-LLaMA-2-13b-base model (#5224)
* update readme

* update readme

* update link

* update

* update readme

* update

* update

* update

* update title

* update example

* update example

* fix content

* add conclusion

* add license

* update

* update

* update version

* fix minor
2024-01-05 17:24:26 +08:00
digger yu
b0b53a171c
[nfc] fix typo colossalai/shardformer/ (#5133) 2024-01-04 16:21:55 +08:00
flybird11111
451e9142b8
fix flash attn (#5209) 2024-01-03 14:39:53 +08:00
flybird11111
365671be10
fix-test (#5210)
fix-test

fix-test
2024-01-03 14:26:13 +08:00
Hongxin Liu
7f3400b560
[devops] update torch versoin in ci (#5217) 2024-01-03 11:46:33 +08:00
Wenhao Chen
d799a3088f
[pipeline]: add p2p fallback order and fix interleaved pp deadlock (#5214)
* fix: add fallback order option and update 1f1b

* fix: fix deadlock comm in interleaved pp

* test: modify p2p test
2024-01-03 11:34:49 +08:00
Wenhao Chen
3c0d82b19b
[pipeline]: support arbitrary batch size in forward_only mode (#5201)
* fix: remove drop last in val & test dataloader

* feat: add run_forward_only, support arbitrary bs

* chore: modify ci script
2024-01-02 23:41:12 +08:00
flybird11111
02d2328a04
support linear accumulation fusion (#5199)
support linear accumulation fusion

support linear accumulation fusion

fix
2023-12-29 18:22:42 +08:00
Zhongkai Zhao
64519eb830
[doc] Update required third-party library list for testing and torch compatibility checking (#5207)
* doc/update requirements-test.txt

* update torch-cuda compatibility check
2023-12-27 18:03:45 +08:00
Yuanchen
eae01b6740
Improve logic for selecting metrics (#5196)
Co-authored-by: Xu <yuanchen.xu00@gmail.com>
2023-12-22 14:52:50 +08:00
Wenhao Chen
4fa689fca1
[pipeline]: fix p2p comm, add metadata cache and support llama interleaved pp (#5134)
* test: add more p2p tests

* fix: remove send_forward_recv_forward as p2p op list need to use the same group

* fix: make send and receive atomic

* feat: update P2PComm fn

* feat: add metadata cache in 1f1b

* feat: add metadata cache in interleaved pp

* feat: modify is_xx_stage fn

* revert: add _broadcast_object_list

* feat: add interleaved pp in llama policy

* feat: set NCCL_BUFFSIZE in HybridParallelPlugin
2023-12-22 10:44:00 +08:00
BlueRum
af952673f7
polish readme in application/chat (#5194) 2023-12-20 11:28:39 +08:00
flybird11111
681d9b12ef
[doc] update pytorch version in documents. (#5177)
* fix

aaa

fix

fix

fix

* fix

* fix

* test ci

* fix ci

fix

* update pytorch version in documents
2023-12-15 18:16:48 +08:00
Yuanchen
3ff60d13b0
Fix ColossalEval (#5186)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
2023-12-15 15:06:06 +08:00
flybird11111
79718fae04
[shardformer] llama support DistCrossEntropy (#5176)
* fix

aaa

fix

fix

fix

* fix

* fix

* test ci

* fix ci

fix

* llama support dist-cross

fix

fix

fix

fix

fix

fix

fix

fix

* fix

* fix

* fix

fix

* test ci

* test ci

* fix

* [Colossal-Llama-2] Add finetuning Colossal-Llama-2 example (#4878)

* Add finetuning Colossal-Llama-2 example

* Add finetuning Colossal-Llama-2 example 2

* Add finetuning Colossal-Llama-2 example and support NEFTuning

* Add inference example and refine neftune

* Modify readme file

* update the imports

---------

Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>

* llama support dist-cross

fix

fix

fix

fix

fix

fix

fix

fix

* fix

* fix

* fix

fix

* test ci

* test ci

* fix

* fix ci

* fix ci

---------

Co-authored-by: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
2023-12-13 01:39:14 +08:00
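With tensor parallelism the vocabulary dimension of the logits is sharded, so the cross-entropy must be computed without gathering the full vocabulary: each rank reduces a local max and a local exp-sum, and the target logit is contributed only by the rank that owns that vocab slice. The sketch below is a conceptual vocab-parallel cross-entropy, assuming an initialized process group tp_group and per-rank shard bounds; it is not the DistCrossEntropy implementation itself.

import torch
import torch.distributed as dist

def dist_cross_entropy(local_logits, targets, vocab_start, vocab_end, tp_group):
    # local_logits: [tokens, local_vocab] shard of the full logits; targets: [tokens] global token ids
    logits_max = local_logits.max(dim=-1).values
    dist.all_reduce(logits_max, op=dist.ReduceOp.MAX, group=tp_group)   # global max for numerical stability
    shifted = local_logits - logits_max.unsqueeze(-1)
    sum_exp = shifted.exp().sum(dim=-1)
    dist.all_reduce(sum_exp, op=dist.ReduceOp.SUM, group=tp_group)      # global softmax denominator

    # Only the rank owning the target's vocab slice contributes its logit.
    in_shard = (targets >= vocab_start) & (targets < vocab_end)
    target_logit = torch.zeros_like(sum_exp)
    target_logit[in_shard] = shifted[in_shard, targets[in_shard] - vocab_start]
    dist.all_reduce(target_logit, op=dist.ReduceOp.SUM, group=tp_group)

    return (sum_exp.log() - target_logit).mean()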
Yuanchen
cefdc32615
[ColossalEval] Support GSM, Data Leakage Evaluation and Tensor Parallel (#5169)
* Support GSM, Data Leakage Evaluation and Tensor Parallel

* remove redundant code and update inference.py in examples/gpt_evaluation

---------

Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
2023-12-12 14:47:35 +08:00
Michelle
b07a6f4e27
[colossalqa] fix pangu api (#5170)
* fix pangu api

* add comment
2023-12-11 14:08:11 +08:00
flybird11111
21aa5de00b
[gemini] hotfix NaN loss while using Gemini + tensor_parallel (#5150)
* fix

aaa

fix

fix

fix

* fix

* fix

* test ci

* fix ci

fix
2023-12-08 11:10:51 +08:00
Yuanchen
b397104438
[Colossal-Llama-2] Add finetuning Colossal-Llama-2 example (#4878)
* Add finetuning Colossal-Llama-2 example

* Add finetuning Colossal-Llama-2 example 2

* Add finetuning Colossal-Llama-2 example and support NEFTuning

* Add inference example and refine neftune

* Modify readme file

* update the imports

---------

Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: Camille Zhong <44392324+Camille7777@users.noreply.github.com>
2023-12-07 14:02:03 +08:00
flybird11111
3dbbf83f1c
fix (#5158)
fix
2023-12-05 14:28:36 +08:00
Michelle
368b5e3d64
[doc] fix colossalqa document (#5146)
* fix doc

* modify doc
2023-12-01 21:39:53 +08:00
Michelle
c7fd9a5213
[ColossalQA] refactor server and webui & add new feature (#5138)
* refactor server and webui & add new feature

* add requirements

* modify readme and ui
2023-11-30 22:55:52 +08:00
flybird11111
2a2ec49aa7
[plugin] fix 3d checkpoint load when booster boosts without optimizer. (#5135)
* fix 3d checkpoint load when booster boost without optimizer

fix 3d checkpoint load when booster boost without optimizer

* test ci

* revert ci

* fix

fix
2023-11-30 18:37:47 +08:00
Xuanlei Zhao
d6df19bae7
[npu] support triangle attention for llama (#5130)
* update fused attn

* update spda

* tri attn

* update triangle

* import

* fix

* fix
2023-11-30 14:21:30 +08:00
Frank Lee
f4e72c9992
[accelerator] init the accelerator module (#5129)
* [accelerator] init the accelerator module

* polish code

* polish code

* polish code

* polish code
2023-11-30 13:25:17 +08:00
github-actions[bot]
f6731db67c
[format] applied code formatting on changed files in pull request 5115 (#5118)
Co-authored-by: github-actions <github-actions@github.com>
2023-11-29 13:39:14 +08:00
github-actions[bot]
9b36640f28
[format] applied code formatting on changed files in pull request 5124 (#5125)
Co-authored-by: github-actions <github-actions@github.com>
2023-11-29 13:39:02 +08:00
github-actions[bot]
d10ee42f68
[format] applied code formatting on changed files in pull request 5088 (#5127)
Co-authored-by: github-actions <github-actions@github.com>
2023-11-29 13:38:37 +08:00
digger yu
9110406a47
fix typo change JOSNL TO JSONL etc. (#5116) 2023-11-29 11:08:32 +08:00
Frank Lee
2899cfdabf
[doc] updated paper citation (#5131) 2023-11-29 10:47:51 +08:00
binmakeswell
177c79f2d1
[doc] add moe news (#5128)
* [doc] add moe news

* [doc] add moe news

* [doc] add moe news
2023-11-28 17:44:06 +08:00
Wenhao Chen
7172459e74
[shardformer]: support gpt-j, falcon, Mistral and add interleaved pipeline for bert (#5088)
* [shardformer] implement policy for all GPT-J models and test

* [shardformer] support interleaved pipeline parallel for bert finetune

* [shardformer] shardformer support falcon (#4883)

* [shardformer]: fix interleaved pipeline for bert model (#5048)

* [hotfix]: disable seq parallel for gptj and falcon, and polish code (#5093)

* Add Mistral support for Shardformer (#5103)

* [shardformer] add tests to mistral (#5105)

---------

Co-authored-by: Pengtai Xu <henryxu880@gmail.com>
Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com>
Co-authored-by: flybird11111 <1829166702@qq.com>
Co-authored-by: eric8607242 <e0928021388@gmail.com>
2023-11-28 16:54:42 +08:00
アマデウス
126cf180bc
[hotfix] fixed memory usage of shardformer module replacement (#5122) 2023-11-28 15:38:26 +08:00
Zian(Andy) Zheng
7b789f4dd2 [FEATURE] Add Safety Eval Datasets to ColossalEval (#5095)
* add safetybench and cvalues(responsibility) eval dataset

* Modify code according to review suggestions

---------

Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
2023-11-28 11:15:04 +08:00
digger yu
d5661f0f25
[nfc] fix typo change directoty to directory (#5111) 2023-11-27 18:25:53 +08:00
digger yu
2bdf76f1f2
fix typo change lazy_iniy to lazy_init (#5099) 2023-11-24 19:15:59 +08:00
Xuanlei Zhao
68fcaa2225
remove duplicate import (#5100) 2023-11-23 15:15:01 +08:00
YeAnbang
e53e729d8e
[Feature] Add document retrieval QA (#5020)
* add langchain

* add langchain

* Add files via upload

* add langchain

* fix style

* fix style: remove extra space

* add pytest; modified retriever

* add pytest; modified retriever

* add tests to build_on_pr.yml

* fix build_on_pr.yml

* fix build on pr; fix environ vars

* separate unit tests for colossalqa from build_on_pr

* fix container setting; fix environ vars

* commented dev code

* add incremental update

* remove stale code

* fix style

* change to sha3 224

* fix retriever; fix style; add unit test for document loader

* fix ci workflow config

* fix ci workflow config

* add set cuda visible device script in ci

* fix doc string

* fix style; update readme; refactored

* add force log info

* change build on pr, ignore colossalqa

* fix docstring, capitalize all initial letters

* fix indexing; fix text-splitter

* remove debug code, update reference

* reset previous commit

* update LICENSE update README add key-value mode, fix bugs

* add files back

* revert force push

* remove junk file

* add test files

* fix retriever bug, add intent classification

* change conversation chain design

* rewrite prompt and conversation chain

* add ui v1

* ui v1

* fix avatar

* add header

* Refactor the RAG Code and support Pangu

* Refactor the ColossalQA chain to Object-Oriented Programming and the UI demo.

* resolved conversation. tested scripts under examples. web demo still buggy

* fix ci tests

* Some modifications to add ChatGPT api

* modify llm.py and remove unnecessary files

* Delete applications/ColossalQA/examples/ui/test_frontend_input.json

* Remove OpenAI api key

* add colossalqa

* move files

* move files

* move files

* move files

* fix style

* Add Readme and fix some bugs.

* Add something to readme and modify some code

* modify a directory name for clarity

* remove redundant directory

* Correct a typo in llm.py

* fix AI prefix

* fix test_memory.py

* fix conversation

* fix some errors and typos

* Fix a missing import in RAG_ChatBot.py

* add colossalcloud LLM wrapper, correct issues in code review

---------

Co-authored-by: YeAnbang <anbangy2@outlook.com>
Co-authored-by: Orion-Zheng <zheng_zian@u.nus.edu>
Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com>
Co-authored-by: Orion-Zheng <zhengzian@u.nus.edu>
2023-11-23 10:33:48 +08:00
Xuanlei Zhao
3acbf6d496
[npu] add npu support for hybrid plugin and llama (#5090)
* llama 3d

* update

* fix autocast
2023-11-22 19:23:21 +08:00
flybird11111
aae496631c
[shardformer] fix flash attention: when the mask is causal, just don't unpad it (#5084)
* fix flash attn

* fix

fix
2023-11-22 16:00:07 +08:00
Zhongkai Zhao
75af66cd81
[Hotfix] Fix model policy matching strategy in ShardFormer (#5064)
* hotfix/Fix get model policy strategy in ShardFormer

* fix bug in auto policy
2023-11-22 11:19:39 +08:00
flybird11111
4ccb9ded7d
[gemini] fix gemini optimizer: saving Shardformer in Gemini got list assignment index out of range (#5085) 2023-11-22 11:14:25 +08:00
digger yu
0d482302a1
[nfc] fix typo and author name (#5089) 2023-11-22 10:39:01 +08:00
digger yu
fd3567e089
[nfc] fix typo in docs/ (#4972) 2023-11-21 22:06:20 +08:00
Jun Gao
dce05da535
fix thrust-transform-reduce error (#5078) 2023-11-21 15:09:35 +08:00
Hongxin Liu
1cd7efc520
[inference] refactor examples and fix schedule (#5077)
* [setup] refactor infer setup

* [hotfix] fix inference behavior on 1 1 gpu

* [example] refactor inference examples
2023-11-21 10:46:03 +08:00
Bin Jia
4e3959d316
[hotfix/hybridengine] Fix init model with random parameters in benchmark (#5074)
* fix init model with random parameters

* fix example
2023-11-20 20:15:25 +08:00
github-actions[bot]
8921a73c90
[format] applied code formatting on changed files in pull request 5067 (#5072)
Co-authored-by: github-actions <github-actions@github.com>
2023-11-20 19:46:43 +08:00
Xu Kai
fb103cfd6e
[inference] update examples and engine (#5073)
* update examples and engine

* fix choices

* update example
2023-11-20 19:44:52 +08:00
Bin Jia
0c7d8bebd5
[hotfix/hybridengine] fix bug when tp*pp size = 1 (#5069) 2023-11-20 17:15:37 +08:00
Hongxin Liu
e5ce4c8ea6
[npu] add npu support for gemini and zero (#5067)
* [npu] setup device utils (#5047)

* [npu] add npu device support

* [npu] support low level zero

* [test] update npu zero plugin test

* [hotfix] fix import

* [test] recover tests

* [npu] gemini support npu (#5052)

* [npu] refactor device utils

* [gemini] support npu

* [example] llama2+gemini support npu

* [kernel] add arm cpu adam kernel (#5065)

* [kernel] add arm cpu adam

* [optim] update adam optimizer

* [kernel] arm cpu adam remove bf16 support
2023-11-20 16:12:41 +08:00
Hongxin Liu
8d56c9c389
[misc] remove outdated submodule (#5070) 2023-11-20 15:27:44 +08:00
Cuiqing Li (李崔卿)
bce919708f
[Kernels] added flash-decoding of triton (#5063)
* added flash-decoding of triton based on lightllm kernel

* add req

* clean

* clean

* delete build.sh

---------

Co-authored-by: cuiqing.li <lixx336@gmail.com>
2023-11-20 13:58:29 +08:00
Xu Kai
fd6482ad8c
[inference] Refactor inference architecture (#5057)
* [inference] support only TP (#4998)

* support only tp

* enable tp

* add support for bloom (#5008)

* [refactor] refactor gptq and smoothquant llama (#5012)

* refactor gptq and smoothquant llama

* fix import error

* fix linear import torch-int

* fix smoothquant llama import error

* fix import accelerate error

* fix bug

* fix import smooth cuda

* fix smoothcuda

* [Inference Refactor] Merge chatglm2 with pp and tp (#5023)

merge chatglm with pp and tp

* [Refactor] remove useless inference code (#5022)

* remove useless code

* fix quant model

* fix test import bug

* mv original inference legacy

* fix chatglm2

* [Refactor] refactor policy search and quant type controlling in inference (#5035)

* [Refactor] refactor policy search and quant type controlling in inference

* [inference] update readme (#5051)

* update readme

* update readme

* fix architecture

* fix table

* fix table

* [inference] update example (#5053)

* update example

* fix run.sh

* fix rebase bug

* fix some errors

* update readme

* add some features

* update interface

* update readme

* update benchmark

* add requirements-infer

---------

Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
2023-11-19 21:05:05 +08:00
flybird11111
bc09b95f50
[example] fix llama example's loss error when using gemini plugin (#5060)
fix llama example
2023-11-18 18:41:58 +08:00
Wenhao Chen
3c08f17348
[hotfix]: modify create_ep_hierarchical_group and add test (#5032)
* feat: modify create_ep_hierarchical_group args

* test: add ep tests

* fix: remove get_process_group_ranks

* fix: fix src_rank
2023-11-17 10:53:00 +08:00
flybird11111
97cd0cd559
[shardformer] fix llama error when transformers upgraded. (#5055)
* fix-llama

* Update llama.py
2023-11-16 21:34:04 +08:00
flybird11111
3e02154710
[gemini] gemini support extra-dp (#5043)
* support ddp

* fix

* fix

* fix

fix

* support ddp

* fix

* fix

* fix

fix

* simplify tests

* fix

* fix

* fix

fix

fix

* fix
2023-11-16 21:03:04 +08:00
Elsa Granger
b2ad0d9e8f
[pipeline,shardformer] Fix p2p efficiency in pipeline, allow skipping loading weight not in weight_map when strict=False, fix llama flash attention forward, add flop estimation by megatron in llama benchmark (#5017)
* Use p2p

* Cannot send p2p bidirectionally

* Refactor tensor creation and serialization in P2P
communication

* Fix llama forward args in flash attention

* Add flop estimate from megatron

* Support loading weight not in weight_map when strict=False in hybrid_parallel

* Use send_forward_recv_backward, etc in 1f1b

* Use dataclass for metadata
Remove torch.cuda.synchronize() as suggested

* Add comment about the torch.cuda.synchronize for potential error

* Typo

* Update hybrid_parallel_checkpoint_io.py

* Update p2p.py

* Update one_f_one_b.py

* Update p2p.py

---------

Co-authored-by: flybird11111 <1829166702@qq.com>
2023-11-16 20:15:59 +08:00
Cuiqing Li (李崔卿)
28052a71fb
[Kernels] Update triton kernels to 2.1.0 (#5046)
* update flash-context-attention

* adding kernels

* fix

* reset

* add build script

* add building process

* add llama2 example

* add colossal-llama2 test

* clean

* fall back test setting

* fix test file

* clean

* clean

* clean

---------

Co-authored-by: cuiqing.li <lixx336@gmail.com>
2023-11-16 16:43:15 +08:00
Orion-Zheng
43ad0d9ef0 fix wrong EOS token in ColossalChat 2023-11-14 10:49:49 +08:00
Zhongkai Zhao
70885d707d
[hotfix] Support extra_kwargs in ShardConfig (#5031)
* [refactor]: replace inference args with extra_kwargs in ShardConfig

* modify shardconfig

* polish code

* fix policy bug in llama

* fix bug in auto policy

* remove setattr in ShardConfig
2023-11-10 10:49:50 +08:00
flybird11111
576a2f7b10
[gemini] gemini support tensor parallelism. (#4942)
* [colossalai]fix typo

* [inference] Add smoothquant for llama (#4904)

* [inference] add int8 rotary embedding kernel for smoothquant (#4843)

* [inference] add smoothquant llama attention (#4850)

* add smoothquant llama attention

* remove useless code

* remove useless code

* fix import error

* rename file name

* [inference] add silu linear fusion for smoothquant llama mlp  (#4853)

* add silu linear

* update skip condition

* catch smoothquant cuda lib exception

* process exception for tests

* [inference] add llama mlp for smoothquant (#4854)

* add llama mlp for smoothquant

* fix down out scale

* remove duplicate lines

* add llama mlp check

* delete useless code

* [inference] add smoothquant llama (#4861)

* add smoothquant llama

* fix attention accuracy

* fix accuracy

* add kv cache and save pretrained

* refactor example

* delete smooth

* refactor code

* [inference] add smooth function and delete useless code for smoothquant (#4895)

* add smooth function and delete useless code

* update datasets

* remove duplicate import

* delete useless file

* refactor codes (#4902)

* refactor code

* add license

* add torch-int and smoothquant license

* Update flash_attention_patch.py

To be compatible with the new change in the Transformers library, where a new argument 'padding_mask' was added to the forward function of the attention layer.
https://github.com/huggingface/transformers/pull/25598

* [kernel] support pure fp16 for cpu adam and update gemini optim tests (#4921)

* [kernel] support pure fp16 for cpu adam (#4896)

* [kernel] fix cpu adam kernel for pure fp16 and update tests (#4919)

* [kernel] fix cpu adam

* [test] update gemini optim test

* [format] applied code formatting on changed files in pull request 4908 (#4918)

Co-authored-by: github-actions <github-actions@github.com>

* [gemini] support gradient accumulation (#4869)

* add test

* fix no_sync bug in low level zero plugin

* fix test

* add argument for grad accum

* add grad accum in backward hook for gemini

* finish implementation, rewrite tests

* fix test

* skip stuck model in low level zero test

* update doc

* optimize communication & fix gradient checkpoint

* modify doc

* cleaning codes

* update cpu adam fp16 case

* [hotfix] fix torch 2.0 compatibility (#4936)

* [hotfix] fix launch

* [test] fix test gemini optim

* [shardformer] fix vit

* [test] add no master test for low level zero plugin (#4934)

* [format] applied code formatting on changed files in pull request 4820 (#4886)

Co-authored-by: github-actions <github-actions@github.com>

* [nfc] fix some typo with colossalai/ docs/ etc. (#4920)

* [Refactor] Integrated some lightllm kernels into token-attention  (#4946)

* add some req for inference

* clean codes

* add codes

* add some lightllm deps

* clean codes

* hello

* delete rms files

* add some comments

* add comments

* add doc

* add lightllm deps

* add lightllm chatglm2 kernels

* add lightllm chatglm2 kernels

* replace rotary embedding with lightllm kernel

* add some comments

* add some comments

* add some comments

* add

* replace fwd kernel att1

* fix a arg

* add

* add

* fix token attention

* add some comments

* clean codes

* modify comments

* fix readme

* fix bug

* fix bug

---------

Co-authored-by: cuiqing.li <lixx336@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497@outlook.com>

* [test] merge old components to test to model zoo (#4945)

* [test] add custom models in model zoo

* [test] update legacy test

* [test] update model zoo

* [test] update gemini test

* [test] remove components to test

* [inference] add reference and fix some bugs (#4937)

* add reference and fix some bugs

* update gptq init

---------

Co-authored-by: Xu Kai <xukai16@foxamil.com>

* [Inference]ADD Bench Chatglm2 script (#4963)

* add bench chatglm

* fix bug and make utils

---------

Co-authored-by: CjhHa1 <cjh18671720497outlook.com>

* [Pipeline inference] Combine kvcache with pipeline inference (#4938)

* merge kvcache with pipeline inference and refactor the code structure

* support ppsize > 2

* refactor pipeline code

* do pre-commit

* modify benchmark

* fix benchmark

* polish code

* add docstring and update readme

* refactor the code

* fix some logic bug of ppinfer

* polish readme

* fix typo

* skip infer test

* updated c++17 compiler flags (#4983)

* [Inference] Dynamic Batching Inference, online and offline (#4953)

* [inference] Dynamic Batching for Single and Multiple GPUs (#4831)

* finish batch manager

* 1

* first

* fix

* fix dynamic batching

* llama infer

* finish test

* support different lengths generating

* del prints

* del prints

* fix

* fix bug

---------

Co-authored-by: CjhHa1 <cjh18671720497outlook.com>

* [inference] Async dynamic batching  (#4894)

* finish input and output logic

* add generate

* test forward

* 1

* [inference]Re push async dynamic batching (#4901)

* adapt to ray server

* finish async

* finish test

* del test

---------

Co-authored-by: yuehuayingxueluo <867460659@qq.com>

* Revert "[inference]Re push async dynamic batching (#4901)" (#4905)

This reverts commit fbf3c09e67.

* Revert "[inference] Async dynamic batching  (#4894)"

This reverts commit fced140250.

* Revert "[inference] Async dynamic batching  (#4894)" (#4909)

This reverts commit fced140250.

* Add Ray Distributed Environment Init Scripts

* support DynamicBatchManager base function

* revert _set_tokenizer version

* add driver async generate

* add async test

* fix bugs in test_ray_dist.py

* add get_tokenizer.py

* fix code style

* fix bugs about No module named 'pydantic' in ci test

* fix bugs in ci test

* fix bugs in ci test

* fix bugs in ci test

* [infer]Add Ray Distributed Environment Init Scripts (#4911)

* Revert "[inference] Async dynamic batching  (#4894)"

This reverts commit fced140250.

* Add Ray Distributed Environment Init Scripts

* support DynamicBatchManager base function

* revert _set_tokenizer version

* add driver async generate

* add async test

* fix bugs in test_ray_dist.py

* add get_tokenizer.py

* fix code style

* fix bugs about No module named 'pydantic' in ci test

* fix bugs in ci test

* fix bugs in ci test

* fix bugs in ci test

* support dynamic batch for bloom model and is_running function

* [Inference]Test for new Async engine (#4935)

* infer engine

* infer engine

* test engine

* test engine

* new manager

* change step

* add

* test

* fix

* fix

* finish test

* finish test

* finish test

* finish test

* add license

---------

Co-authored-by: yuehuayingxueluo <867460659@qq.com>

* add assertion for config (#4947)

* [Inference] Finish dynamic batching offline test (#4948)

* test

* fix test

* fix quant

* add default

* fix

* fix some bugs

* fix some bugs

* fix

* fix bug

* fix bugs

* reset param

---------

Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497outlook.com>

* [Kernels] Updated Triton kernels to 2.1.0 and added flash-decoding for llama token attention (#4965)

* adding flash-decoding

* clean

* adding kernel

* adding flash-decoding

* add integration

* add

* adding kernel

* adding kernel

* adding triton 2.1.0 features for inference

* update bloom triton kernel

* remove useless vllm kernels

* clean codes

* fix

* adding files

* fix readme

* update llama flash-decoding

---------

Co-authored-by: cuiqing.li <lixx336@gmail.com>

* fix ColossalEval (#4992)

Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>

* [doc]Update doc for colossal-inference (#4989)

* update doc

* Update README.md

---------

Co-authored-by: cuiqing.li <lixx336@gmail.com>

* [hotfix] Fix the bug where process groups were not being properly released. (#4940)

* Fix the bug where process groups were not being properly released.

* test

* Revert "test"

This reverts commit 479900c139.

* [hotfix] fix the bug of repeatedly storing param group (#4951)

* [doc] add supported feature diagram for hybrid parallel plugin (#4996)

* [Pipeline Inference] Merge pp with tp (#4993)

* refactor pipeline into new CaiInferEngine

* update llama modeling forward

* merge tp with pp

* update docstring

* optimize test workflow and example

* fix typo

* add assert and todo

* [release] update version (#4995)

* [release] update version

* [hotfix] fix ci

* [gemini] gemini support tp

[gemini] gemini support tp

[gemini] gemini support tp

[gemini] gemini support tp

[gemini] gemini support tp

* fix

fix

fix

* update checkpointIO

update checkpointIO

update checkpointIO

update checkpointIO

update checkpointIO

update checkpointIO

update checkpointIO

update checkpointIO

update checkpointIO

* support fused layernorm

support fused layernorm

support fused layernorm

* update fusedlayernorm

update fusedlayernorm

update fusedlayernorm

* add sequence parallel to gemini

add sequence parallel to gemini

* fix

* fix comments

fix comments

fix comments

* fix

* fix t5

* clear cache

* fix

* activate ci

* activate ci

* fix

* fix

* fix

* fix

* revert

* modify tp gather method

modify tp gather method

modify tp gather method

modify tp gather method

* fix test

---------

Co-authored-by: Xu Kai <xukai16@foxmail.com>
Co-authored-by: Zian(Andy) Zheng <62330719+Orion-Zheng@users.noreply.github.com>
Co-authored-by: Hongxin Liu <lhx0217@gmail.com>
Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
Co-authored-by: github-actions <github-actions@github.com>
Co-authored-by: Baizhou Zhang <eddiezhang@pku.edu.cn>
Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
Co-authored-by: digger yu <digger-yu@outlook.com>
Co-authored-by: Cuiqing Li <lixx3527@gmail.com>
Co-authored-by: cuiqing.li <lixx336@gmail.com>
Co-authored-by: CjhHa1 <cjh18671720497@outlook.com>
Co-authored-by: Xu Kai <xukai16@foxamil.com>
Co-authored-by: Jianghai <72591262+CjhHa1@users.noreply.github.com>
Co-authored-by: Bin Jia <45593998+FoolPlayer@users.noreply.github.com>
Co-authored-by: アマデウス <kurisusnowdeng@users.noreply.github.com>
Co-authored-by: yuehuayingxueluo <867460659@qq.com>
Co-authored-by: Yuanchen <70520919+chengeharrison@users.noreply.github.com>
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
Co-authored-by: littsk <1214689160@qq.com>
Co-authored-by: ppt0011 <143150326+ppt0011@users.noreply.github.com>
2023-11-10 10:15:16 +08:00
Jun Gao
a4489384d5
[shardformer] Fix serialization error with Tensor Parallel state saving (#5018)
* Fix serialization error with Tensor Parallel state saving

* Refactor state_dict CPU transfer using tree_map
2023-11-09 17:00:25 +08:00
Wenhao Chen
724441279b
[moe]: fix ep/tp tests, add hierarchical all2all (#4982)
* fix: add warning for EP different behavior

* fix: use shard_data in ep & tp model

* to: add used_capacity

* fix: fix router test

* feat: add create_ep_node_group

* feat: add create_ep_hierarchical_group fn

* feat: add HierarchicalAllToAll

* test: add hierarchical all2all test

* fix: fix test errors

* fix: simplify create_ep_hierarchical_group

* fix: add hierarchical_alltoall arg

* fix: fix environ typo

* revert: revert process mesh order

* to: add todo mark

* fix: skip hierarchical_comm if torch < 1.13.1
2023-11-09 06:31:00 +00:00
Yuanchen
239cd92eff
Support mtbench (#5025)
Co-authored-by: Xu Yuanchen <yuanchen.xu00@gmail.com>
2023-11-09 13:41:50 +08:00
Xuanlei Zhao
f71e63b0f3
[moe] support optimizer checkpoint (#5015)
* Refactor MoE Manager setup method

* unshard optim ckpt

* optim io

* update transformer version

* update requirements

* update ckpt

* update ckpt

* update ckpt

* fix engine

* fix engine
2023-11-08 15:07:03 +00:00
Hongxin Liu
67f5331754
[misc] add code owners (#5024) 2023-11-08 15:18:51 +08:00
Jianghai
ef4c14a5e2
[Inference] Fix bug in ChatGLM2 Tensor Parallelism (#5014)
* fix bug

* fix

* fix multiquery

* fix multiquery

---------

Co-authored-by: CjhHa1 <cjh18671720497outlook.com>
2023-11-07 15:01:50 +08:00
github-actions[bot]
c36e782d80
[format] applied code formatting on changed files in pull request 4926 (#5007)
Co-authored-by: github-actions <github-actions@github.com>
2023-11-06 17:08:12 +08:00
littsk
1a3315e336
[hotfix] Add layer norm gradients all-reduce for sequence parallel (#4926)
* [hotfix] Add layer norm gradients all-reduce for sequence parallel. (#4915)

* Add layer norm gradients all-reduce for sequence parallel.

* skip pipeline inference test

* [hotfix] fixing policies of sequence parallel (#4922)

* Add layer norm gradients all-reduce for sequence parallel.

* fix parameter passing when calling get_autopolicy

---------

Co-authored-by: littsk <1214689160@qq.com>

* Hotfix/add grad all reduce for sequence parallel (#4927)

* Add layer norm gradients all-reduce for sequence parallel.


* fix parameter passing when calling get_autopolicy

* fix bug using wrong variables

---------

Co-authored-by: littsk <1214689160@qq.com>

* fix policy initialization

* fix bloom and chatglm policies

* polish code of handling layernorm

* fix moe module

* polish code of class initializing

---------

Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com>
2023-11-03 13:32:43 +08:00
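With sequence parallelism each rank only sees a slice of the sequence, so LayerNorm weight and bias gradients are partial sums and must be all-reduced over the sequence-parallel group before the optimizer step. A minimal sketch of such a post-backward pass follows; the group handle sp_group is an assumption, and the real hook lives inside the plugin.

import torch
import torch.distributed as dist

def allreduce_layernorm_grads(model, sp_group):
    # Call after backward() and before optimizer.step().
    for module in model.modules():
        if isinstance(module, torch.nn.LayerNorm):
            for param in module.parameters():
                if param.grad is not None:
                    dist.all_reduce(param.grad, op=dist.ReduceOp.SUM, group=sp_group)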
Baizhou Zhang
d99b2c961a
[hotfix] fix grad accumulation plus clipping for gemini (#5002) 2023-11-02 17:59:10 +08:00
Xuanlei Zhao
dc003c304c
[moe] merge moe into main (#4978)
* update moe module
* support openmoe
2023-11-02 02:21:24 +00:00
1396 changed files with 108515 additions and 33557 deletions

View File

@ -1,3 +1,3 @@
1.12.0-11.3.0
1.13.0-11.6.0
2.0.0-11.7.0
2.3.0-12.1.0
2.4.0-12.4.1
2.5.1-12.4.1
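This file pairs each supported PyTorch release with the CUDA toolkit it is built against, and CI uses it to pick container images. The snippet below is only an illustration of how such a list could be checked against the local environment; the parsing is not the actual CI logic.

import torch

COMPATIBILITY = ["2.3.0-12.1.0", "2.4.0-12.4.1", "2.5.1-12.4.1"]  # torch-version/CUDA-version pairs

def is_supported(torch_version, cuda_version):
    return f"{torch_version}-{cuda_version}" in COMPATIBILITY

local_torch = torch.__version__.split("+")[0]
local_cuda = torch.version.cuda or "cpu-only"
print(local_torch, local_cuda, is_supported(local_torch, local_cuda))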

View File

@ -1,16 +1,12 @@
{
"build": [
{
"torch_command": "pip install torch==1.12.1+cu102 torchvision==0.13.1+cu102 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu102",
"cuda_image": "hpcaitech/cuda-conda:10.2"
"torch_command": "pip install torch==2.3.0 torchvision==0.18.0 torchaudio==2.3.0 --index-url https://download.pytorch.org/whl/cu121",
"cuda_image": "hpcaitech/cuda-conda:12.1"
},
{
"torch_command": "pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113",
"cuda_image": "hpcaitech/cuda-conda:11.3"
},
{
"torch_command": "pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu116",
"cuda_image": "hpcaitech/cuda-conda:11.6"
"torch_command": "pip install torch==2.5.1 torchvision==0.20.1 torchaudio==2.5.1 --index-url https://download.pytorch.org/whl/cu124",
"cuda_image": "hpcaitech/cuda-conda:12.4"
}
]
}

.github/CODEOWNERS vendored Normal file
View File

@ -0,0 +1 @@
* @hpcaitech/colossalai-qa

View File

@ -8,6 +8,33 @@ body:
attributes:
value: >
#### Not suitable for your needs? [Open a blank issue](https://github.com/hpcaitech/ColossalAI/issues/new).
- type: checkboxes
attributes:
label: Is there an existing issue for this bug?
description: Please search [here](https://github.com/hpcaitech/ColossalAI/issues) to see if an open or closed issue already exists for the bug you have encountered.
options:
- label: I have searched the existing issues
required: true
- type: checkboxes
attributes:
label: The bug has not been fixed in the latest main branch
options:
- label: I have checked the latest main branch
required: true
- type: dropdown
id: share_script
attributes:
label: Do you feel comfortable sharing a concise (minimal) script that reproduces the error? :)
description: If not, please share your setting/training config, and/or point to the line in the repo that throws the error.
If the issue is not easily reproducible by us, it will reduce the likelihood of getting responses.
options:
- Yes, I will share a minimal reproducible script.
- No, I prefer not to share.
validations:
required: true
- type: textarea
attributes:
label: 🐛 Describe the bug

View File

@ -3,6 +3,7 @@
- [ ] I have created an issue for this PR for traceability
- [ ] The title follows the standard format: `[doc/gemini/tensor/...]: A concise description`
- [ ] I have added relevant tags if possible for us to better distinguish different PRs
- [ ] I have installed pre-commit: `pip install pre-commit && pre-commit install`
## 🚨 Issue number

View File

@ -2,7 +2,7 @@ name: Build on PR
on:
pull_request:
types: [synchronize, opened, reopened, ready_for_review, closed, edited]
types: [synchronize, opened, reopened, ready_for_review, closed]
branches:
- "main"
- "develop"
@ -22,57 +22,6 @@ on:
delete:
jobs:
prepare_cache:
name: Prepare testmon cache
if: |
github.event_name == 'create' &&
github.event.ref_type == 'branch' &&
github.event.repository.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
steps:
- name: Copy testmon cache
run: | # branch name may contain slash, we need to replace it with space
export REF_BRANCH=$(echo ${{ github.event.ref }} | sed "s/\// /")
if [ -d /github/home/testmon_cache/${MAIN_BRANCH} ]; then
cp -p -r /github/home/testmon_cache/${MAIN_BRANCH} "/github/home/testmon_cache/${REF_BRANCH}"
fi
env:
MAIN_BRANCH: ${{ github.event.master_branch }}
prepare_cache_for_pr:
name: Prepare testmon cache for PR
if: |
github.event_name == 'pull_request' &&
(github.event.action == 'opened' || github.event.action == 'reopened' || (github.event.action == 'edited' && github.event.changes.base != null)) &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-repare-cache
cancel-in-progress: true
steps:
- name: Copy testmon cache
run: | # branch name may contain slash, we need to replace it with space
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
if [ -d "/github/home/testmon_cache/${BASE}" ] && [ ! -z "$(ls -A "/github/home/testmon_cache/${BASE}")" ]; then
mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER} && cp -p -r "/github/home/testmon_cache/${BASE}"/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}
fi
env:
PR_NUMBER: ${{ github.event.number }}
detect:
name: Detect file change
if: |
@ -138,11 +87,11 @@ jobs:
name: Build and Test Colossal-AI
needs: detect
if: needs.detect.outputs.anyLibraryFileChanged == 'true'
runs-on: [self-hosted, gpu]
runs-on: ubuntu-latest
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
timeout-minutes: 60
image: image-cloud.luchentech.com/hpcaitech/pytorch-cuda:2.2.2-12.1.0
options: --gpus all --shm-size=2g --rm -v /dev/shm -v /data/scratch:/data/scratch
timeout-minutes: 90
defaults:
run:
shell: bash
@ -168,12 +117,13 @@ jobs:
cd TensorNVMe
conda install cmake
pip install -r requirements.txt
pip install -v .
DISABLE_URING=1 pip install -v --no-cache-dir .
- name: Store TensorNVMe Cache
run: |
cd TensorNVMe
cp -p -r ./build /github/home/tensornvme_cache/
cp -p -r ./cmake-build /github/home/tensornvme_cache/
- name: Checkout Colossal-AI
uses: actions/checkout@v2
@ -190,38 +140,33 @@ jobs:
- name: Install Colossal-AI
run: |
CUDA_EXT=1 pip install -v -e .
pip install -r requirements/requirements-test.txt
BUILD_EXT=1 pip install -v -e .
pip install --no-cache-dir -r requirements/requirements-test.txt
- name: Store Colossal-AI Cache
run: |
# -p flag is required to preserve the file timestamp to avoid ninja rebuild
cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
- name: Restore Testmon Cache
run: |
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* /__w/ColossalAI/ColossalAI/
fi
env:
PR_NUMBER: ${{ github.event.number }}
- name: Execute Unit Testing
run: |
CURL_CA_BUNDLE="" PYTHONPATH=$PWD pytest -m "not largedist" --testmon --testmon-forceselect --testmon-cov=. --durations=10 tests/
CURL_CA_BUNDLE="" PYTHONPATH=$PWD FAST_TEST=1 pytest \
-m "not largedist" \
--durations=0 \
--ignore tests/test_analyzer \
--ignore tests/test_auto_parallel \
--ignore tests/test_fx \
--ignore tests/test_autochunk \
--ignore tests/test_gptq \
--ignore tests/test_infer_ops \
--ignore tests/test_legacy \
--ignore tests/test_smoothquant \
tests/
env:
DATA: /data/scratch/cifar-10
NCCL_SHM_DISABLE: 1
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
TESTMON_CORE_PKGS: /__w/ColossalAI/ColossalAI/requirements/requirements.txt,/__w/ColossalAI/ColossalAI/requirements/requirements-test.txt
LLAMA_PATH: /data/scratch/llama-tiny
- name: Store Testmon Cache
run: |
mkdir -p /github/home/testmon_cache/_pull/${PR_NUMBER}
cp -p -r /__w/ColossalAI/ColossalAI/.testmondata* /github/home/testmon_cache/_pull/${PR_NUMBER}/
env:
PR_NUMBER: ${{ github.event.number }}
MOE_TENSOR_PATH: /data/scratch/moe_tensors
HF_ENDPOINT: https://hf-mirror.com
- name: Collate artifact
env:
@ -255,58 +200,7 @@ jobs:
fi
- name: Upload test coverage artifact
uses: actions/upload-artifact@v3
uses: actions/upload-artifact@v4
with:
name: report
path: report/
store_cache:
name: Store testmon cache for PR
if: |
github.event_name == 'pull_request' &&
github.event.action == 'closed' &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
steps:
- name: Store testmon cache if possible
if: github.event.pull_request.merged == true
run: | # branch name may contain slash, we need to replace it with space
export BASE=$(echo ${{ github.event.pull_request.base.ref }} | sed "s/\// /")
if [ -d /github/home/testmon_cache/_pull/${PR_NUMBER} ] && [ ! -z "$(ls -A /github/home/testmon_cache/_pull/${PR_NUMBER})" ]; then
cp -p -r /github/home/testmon_cache/_pull/${PR_NUMBER}/.testmondata* "/github/home/testmon_cache/${BASE}/"
fi
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
- name: Remove testmon cache
run: |
rm -rf /github/home/testmon_cache/_pull/${PR_NUMBER}
env:
PR_NUMBER: ${{ github.event.pull_request.number }}
remove_cache:
name: Remove testmon cache
if: |
github.event_name == 'delete' &&
github.event.ref_type == 'branch' &&
github.event.repository.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --rm
timeout-minutes: 5
defaults:
run:
shell: bash
steps:
- name: Remove testmon cache
run: | # branch name may contain slash, we need to replace it with space
export BASE=$(echo ${{ github.event.ref }} | sed "s/\// /")
rm -rf "/github/home/testmon_cache/${BASE}"

View File

@ -10,20 +10,22 @@ jobs:
build:
name: Build and Test Colossal-AI
if: github.repository == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, 8-gpu]
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
timeout-minutes: 40
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/
timeout-minutes: 90
steps:
- name: Check GPU Availability # ensure all GPUs have enough memory
id: check-avai
run: |
avai=true
for i in $(seq 0 7);
ngpu=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
endIndex=$(($ngpu-1))
for i in $(seq 0 $endIndex);
do
gpu_used=$(nvidia-smi -i $i --query-gpu=memory.used --format=csv,noheader,nounits)
[ "$gpu_used" -gt "10000" ] && avai=false
[ "$gpu_used" -gt "2000" ] && avai=false
done
echo "GPU is available: $avai"
@ -42,7 +44,7 @@ jobs:
cd TensorNVMe
conda install cmake
pip install -r requirements.txt
pip install -v .
DISABLE_URING=1 pip install -v .
- uses: actions/checkout@v2
if: steps.check-avai.outputs.avai == 'true'
@ -53,25 +55,29 @@ jobs:
if: steps.check-avai.outputs.avai == 'true'
run: |
[ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ] && cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
CUDA_EXT=1 pip install -v -e .
BUILD_EXT=1 pip install -v -e .
cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
pip install -r requirements/requirements-test.txt
pip install --no-cache-dir -r requirements/requirements-test.txt
- name: Unit Testing
if: steps.check-avai.outputs.avai == 'true'
run: |
PYTHONPATH=$PWD pytest --durations=0 tests
PYTHONPATH=$PWD pytest \
-m "not largedist" \
--durations=0 \
tests/
env:
DATA: /data/scratch/cifar-10
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
LLAMA_PATH: /data/scratch/llama-tiny
MOE_TENSOR_PATH: /data/scratch/moe_tensors
HF_ENDPOINT: https://hf-mirror.com
- name: Notify Lark
id: message-preparation
if: ${{ failure() }}
run: |
url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
msg="Scheduled Build and Test failed on 8 GPUs, please visit $url for details"
msg="Scheduled Build and Test failed, please visit $url for details"
echo $msg
python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u $WEBHOOK_URL
env:

View File

@ -50,46 +50,33 @@ jobs:
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
container:
image: ${{ matrix.container }}
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
timeout-minutes: 120
options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/
timeout-minutes: 200
steps:
- name: Install dependencies
run: |
pip install -U pip setuptools wheel --user
- uses: actions/checkout@v2
with:
repository: hpcaitech/TensorNVMe
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
path: TensorNVMe
- name: Install tensornvme
run: |
cd TensorNVMe
apt update && apt install -y cmake
pip install -r requirements.txt
pip install -v .
pip install -U pip setuptools==68.2.2 wheel --user
- uses: actions/checkout@v2
with:
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
- name: Download cub for CUDA 10.2
run: |
CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}')
# check if it is CUDA 10.2
# download cub
if [ "$CUDA_VERSION" = "10.2" ]; then
wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
unzip 1.8.0.zip
cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/
fi
- name: Install Colossal-AI
run: |
CUDA_EXT=1 pip install -v .
pip install -r requirements/requirements-test.txt
BUILD_EXT=1 pip install -v -e .
pip install --no-cache-dir -r requirements/requirements-test.txt
- name: Install tensornvme
run: |
DISABLE_URING=1 pip install -v git+https://github.com/hpcaitech/TensorNVMe.git
- name: Unit Testing
run: |
PYTHONPATH=$PWD pytest tests
PYTHONPATH=$PWD pytest --durations=0 tests
env:
DATA: /data/scratch/cifar-10
NCCL_SHM_DISABLE: 1
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
LD_LIBRARY_PATH: /github/home/.tensornvme/lib
LLAMA_PATH: /data/scratch/llama-tiny
MOE_TENSOR_PATH: /data/scratch/moe_tensors
HF_ENDPOINT: https://hf-mirror.com

View File

@ -41,50 +41,36 @@ jobs:
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
container:
image: ${{ matrix.container }}
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
timeout-minutes: 120
options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/
timeout-minutes: 200
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-test-${{ matrix.container }}
cancel-in-progress: true
steps:
- name: Install dependencies
run: |
pip install -U pip setuptools wheel --user
- uses: actions/checkout@v2
with:
repository: hpcaitech/TensorNVMe
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
path: TensorNVMe
- name: Install tensornvme
run: |
cd TensorNVMe
apt update && apt install -y cmake
pip install -r requirements.txt
pip install -v .
pip install -U pip setuptools==68.2.2 wheel --user
- uses: actions/checkout@v2
with:
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
- name: Download cub for CUDA 10.2
run: |
CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}')
# check if it is CUDA 10.2
# download cub
if [ "$CUDA_VERSION" = "10.2" ]; then
wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
unzip 1.8.0.zip
cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/
fi
- name: Install Colossal-AI
run: |
CUDA_EXT=1 pip install -v .
pip install -r requirements/requirements-test.txt
BUILD_EXT=1 pip install -v -e .
pip install --no-cache-dir -r requirements/requirements-test.txt
- name: Install tensornvme
run: |
DISABLE_URING=1 pip install -v git+https://github.com/hpcaitech/TensorNVMe.git
- name: Unit Testing
run: |
PYTHONPATH=$PWD pytest tests
PYTHONPATH=$PWD pytest --durations=0 tests
env:
DATA: /data/scratch/cifar-10
NCCL_SHM_DISABLE: 1
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
LD_LIBRARY_PATH: /github/home/.tensornvme/lib
LLAMA_PATH: /data/scratch/llama-tiny
MOE_TENSOR_PATH: /data/scratch/moe_tensors
HF_ENDPOINT: https://hf-mirror.com

View File

@ -38,54 +38,36 @@ jobs:
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
container:
image: ${{ matrix.container }}
options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
timeout-minutes: 120
options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/
timeout-minutes: 200
steps:
- name: Install dependencies
run: |
pip install -U pip setuptools wheel --user
- uses: actions/checkout@v2
with:
repository: hpcaitech/TensorNVMe
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
path: TensorNVMe
- name: Install tensornvme
run: |
cd TensorNVMe
apt update && apt install -y cmake
pip install -r requirements.txt
pip install -v .
pip install -U pip setuptools==68.2.2 wheel --user
- uses: actions/checkout@v2
with:
ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
- name: Download cub for CUDA 10.2
run: |
CUDA_VERSION=$(nvcc -V | awk -F ',| ' '/release/{print $6}')
# check if it is CUDA 10.2
# download cub
if [ "$CUDA_VERSION" = "10.2" ]; then
wget https://github.com/NVIDIA/cub/archive/refs/tags/1.8.0.zip
unzip 1.8.0.zip
cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/
fi
- name: Install Colossal-AI
run: |
CUDA_EXT=1 pip install -v .
pip install -r requirements/requirements-test.txt
BUILD_EXT=1 pip install -v -e .
pip install --no-cache-dir -r requirements/requirements-test.txt
- name: Install tensornvme
run: |
DISABLE_URING=1 pip install -v git+https://github.com/hpcaitech/TensorNVMe.git
- name: Unit Testing
run: |
PYTHONPATH=$PWD pytest tests
PYTHONPATH=$PWD pytest --durations=0 tests
env:
DATA: /data/scratch/cifar-10
NCCL_SHM_DISABLE: 1
LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
LD_LIBRARY_PATH: /github/home/.tensornvme/lib
LLAMA_PATH: /data/scratch/llama-tiny
MOE_TENSOR_PATH: /data/scratch/moe_tensors
HF_ENDPOINT: https://hf-mirror.com
- name: Notify Lark
id: message-preparation


@ -51,4 +51,4 @@ jobs:
- name: Build
run: |
CUDA_EXT=1 pip install -v .
BUILD_EXT=1 pip install -v -e .


@ -58,6 +58,7 @@ jobs:
# there is no main branch, so it's safe to checkout the main branch from the merged branch
# docker will rebase the remote main branch to the merged branch, so we have to configure the git user
- name: Make the merged branch main
run: |
cd ColossalAI
git checkout -b main


@ -56,9 +56,9 @@ jobs:
needs: detect-changed-doc
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
options: --gpus all --rm
timeout-minutes: 20
timeout-minutes: 30
defaults:
run:
shell: bash
@ -89,7 +89,7 @@ jobs:
- name: Install ColossalAI
run: |
source activate pytorch
CUDA_EXT=1 pip install -v .
BUILD_EXT=1 pip install -v -e .
- name: Test the Doc
run: |


@ -12,7 +12,7 @@ jobs:
name: Test the changed Doc
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
options: --gpus all --rm
timeout-minutes: 60
steps:
@ -32,7 +32,7 @@ jobs:
- name: Install ColossalAI
run: |
CUDA_EXT=1 pip install -v .
BUILD_EXT=1 pip install -v -e .
- name: Install Doc Test Requirements
run: |


@ -45,20 +45,18 @@ jobs:
fail-fast: false
matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}}
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/
timeout-minutes: 10
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm
timeout-minutes: 15
steps:
- name: 📚 Checkout
uses: actions/checkout@v3
- name: Install Colossal-AI
run: |
CUDA_EXT=1 pip install -v .
BUILD_EXT=1 pip install -v -e .
- name: Test the example
run: |
dir=${{ matrix.directory }}
echo "Testing ${dir} now"
cd "${PWD}/examples/${dir}"
bash test_ci.sh
env:
NCCL_SHM_DISABLE: 1


@ -8,6 +8,8 @@ on:
# any change in the examples folder will trigger check for the corresponding example.
paths:
- "examples/**"
- "!examples/**.md"
- ".github/workflows/example_check_on_pr.yml"
jobs:
# This is for changed example files detect and output a matrix containing all the corresponding directory name.
@ -19,6 +21,7 @@ jobs:
outputs:
matrix: ${{ steps.setup-matrix.outputs.matrix }}
anyChanged: ${{ steps.setup-matrix.outputs.anyChanged }}
anyExtensionFileChanged: ${{ steps.find-extension-change.outputs.any_changed }}
name: Detect changed example files
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-detect-change
@ -37,6 +40,16 @@ jobs:
echo $commonCommit
echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT
- name: Find the changed extension-related files
id: find-extension-change
uses: tj-actions/changed-files@v35
with:
base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }}
files: |
op_builder/**
colossalai/kernel/**
setup.py
- name: Get all changed example files
id: changed-files
uses: tj-actions/changed-files@v35
@ -77,23 +90,32 @@ jobs:
fail-fast: false
matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}}
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/
timeout-minutes: 10
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm
timeout-minutes: 30
concurrency:
group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-example-${{ matrix.directory }}
cancel-in-progress: true
steps:
- uses: actions/checkout@v3
- name: Restore Colossal-AI Cache
if: needs.detect.outputs.anyExtensionFileChanged != 'true'
run: |
if [ -d /github/home/cuda_ext_cache ] && [ ! -z "$(ls -A /github/home/cuda_ext_cache/)" ]; then
cp -p -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
fi
- name: Install Colossal-AI
run: |
CUDA_EXT=1 pip install -v .
BUILD_EXT=1 pip install -v -e .
- name: Store Colossal-AI Cache
run: |
cp -p -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
- name: Test the example
run: |
example_dir=${{ matrix.directory }}
cd "${PWD}/examples/${example_dir}"
bash test_ci.sh
env:
NCCL_SHM_DISABLE: 1


@ -34,15 +34,16 @@ jobs:
fail-fast: false
matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
timeout-minutes: 10
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm
timeout-minutes: 30
steps:
- name: 📚 Checkout
uses: actions/checkout@v3
- name: Install Colossal-AI
run: |
CUDA_EXT=1 pip install -v .
BUILD_EXT=1 pip install -v -e .
- name: Traverse all files
run: |
@ -50,8 +51,6 @@ jobs:
echo "Testing ${example_dir} now"
cd "${PWD}/examples/${example_dir}"
bash test_ci.sh
env:
NCCL_SHM_DISABLE: 1
- name: Notify Lark
id: message-preparation


@ -1,97 +0,0 @@
name: post-commit
on:
pull_request:
types:
- closed
jobs:
# this job will run after a PR is merged to run pre-commit on any changed file
# so that the user does not need to learn pre-commit and pre-commit can still
# be auto-executed by the workflow
pre-commit:
runs-on: ubuntu-latest
if: github.event.pull_request.merged == true && github.repository == 'hpcaitech/ColossalAI'
steps:
- uses: actions/checkout@v2
with:
fetch-depth: 0
ref: ${{ github.event.pull_request.head.sha }}
# the PR branch and the hpcaitech/colossal-ai main branch
# must share a common commit, we need to locate that commit,
# which is the commit checked-out or forked when the PR branch is created
# such that we can look for files changed since that commit
- name: Locate base commit
id: locate-base-sha
run: |
curBranch=$(git rev-parse --abbrev-ref HEAD)
commonCommit=$(git merge-base origin/main $curBranch)
echo $commonCommit
echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT
- name: Find the changed files
id: find-changed-files
uses: tj-actions/changed-files@v35
with:
base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }}
- name: List all changed files
run: |
for file in ${{ steps.find-changed-files.outputs.all_changed_files }}; do
echo "$file was changed"
done
# check out the main branch
- uses: actions/checkout@v2
with:
ref: 'main'
- uses: actions/setup-python@v3
- name: Cache pre-commit hooks
uses: actions/cache@v3
with:
path: ~/.cache/pre-commit
key: ${{ runner.os }}-pre-commit-hooks
- name: Set up pre-commit
run: |
pip install pre-commit
pre-commit install
# run pre-commit on changed files
- name: Run Pre-commit
run: |
for file in ${{ steps.find-changed-files.outputs.all_changed_files }}; do
pre-commit run --files $file || true
done
# create commit for pre-commit
# when all files are well formatted, there is no need to create a commit
# therefore, this step will produce an error, which should be allowed
- name: Create commits
id: commit
continue-on-error: true
run: |
git config --global user.name 'github-actions'
git config --global user.email 'github-actions@github.com'
git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }}
git add -A
git commit -am "[format] applied code formatting on changed files in pull request ${{ github.event.pull_request.number }}"
# create pull request
- name: Create Pull Request
if: steps.commit.outcome == 'success'
id: cpr
uses: peter-evans/create-pull-request@v4
with:
branch: pre-commit-${{ github.event.pull_request.number }}
title: "[format] applied code formatting on changed files in PR ${{ github.event.pull_request.number }}"
- name: Enable Auto-merge for the New PR
if: steps.commit.outcome == 'success'
uses: peter-evans/enable-pull-request-automerge@v2
with:
pull-request-number: ${{ steps.cpr.outputs.pull-request-number }}
merge-method: squash


@ -24,10 +24,12 @@ jobs:
version=$(cat version.txt)
tag=hpcaitech/colossalai:$version
latest=hpcaitech/colossalai:latest
docker build --build-arg http_proxy=http://172.17.0.1:7890 --build-arg https_proxy=http://172.17.0.1:7890 --build-arg VERSION=v${version} -t $tag ./docker
docker build --build-arg VERSION=v${version} -t $tag ./docker
docker tag $tag $latest
echo "tag=${tag}" >> $GITHUB_OUTPUT
echo "latest=${latest}" >> $GITHUB_OUTPUT
env:
DOCKER_BUILDKIT: 0
- name: Log in to Docker Hub
uses: docker/login-action@f054a8b539a109f9f41c372932f1ae047eff08c9
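The step above only builds and tags the image before it is pushed; a quick local smoke test of the freshly built tag might look like the sketch below. The tag pattern and `version.txt` come from the workflow above, while the `--gpus all` flag and the `python -c` probe are assumptions about the runtime environment, not part of the workflow:

```bash
# Hypothetical smoke test of the image built above (requires the NVIDIA container runtime).
version=$(cat version.txt)
docker run --gpus all --rm hpcaitech/colossalai:$version python -c "import colossalai"
```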


@ -6,11 +6,13 @@ on:
- cron: '0 0 * * 6' # release on every Sunday 00:00 UTC time
jobs:
build-n-publish:
publish:
if: github.repository == 'hpcaitech/ColossalAI'
name: Build and publish Python 🐍 distributions 📦 to PyPI
runs-on: ubuntu-latest
timeout-minutes: 20
outputs:
status: ${{ steps.publish.outcome }}
steps:
- uses: actions/checkout@v2
@ -18,7 +20,9 @@ jobs:
with:
python-version: '3.8.14'
- run: NIGHTLY=1 python setup.py sdist build
- run: |
python .github/workflows/scripts/update_setup_for_nightly.py
python setup.py sdist build
# publish to PyPI if executed on the main branch
- name: Publish package to PyPI
@ -31,7 +35,7 @@ jobs:
notify:
name: Notify Lark via webhook
needs: build-n-publish
needs: publish
runs-on: ubuntu-latest
if: ${{ always() }} && github.repository == 'hpcaitech/ColossalAI'
steps:
@ -62,4 +66,4 @@ jobs:
REPO: ${{ github.repository }}
RUN_ID: ${{ github.run_id }}
WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}
STATUS: ${{ steps.publish.outcome }}
STATUS: ${{ needs.publish.outputs.status }}


@ -49,6 +49,7 @@ jobs:
# we need to install the requirements.txt first
# as test-pypi may not contain the distributions for libs listed in the txt file
pip install -r requirements/requirements.txt
pip install --index-url https://test.pypi.org/simple/ colossalai==$VERSION
pip install -U setuptools==68.2.2 wheel
pip install --index-url https://test.pypi.org/simple/ --extra-index-url https://pypi.python.org/pypi colossalai==$VERSION
env:
VERSION: ${{ steps.prep-version.outputs.version }}


@ -4,10 +4,11 @@ on:
pull_request:
types: [synchronize, opened, reopened]
paths:
- "applications/Chat/coati/**"
- "applications/Chat/requirements.txt"
- "applications/Chat/setup.py"
- "applications/Chat/examples/**"
- "applications/ColossalChat/coati/**"
- "applications/ColossalChat/requirements.txt"
- "applications/ColossalChat/setup.py"
- "applications/ColossalChat/examples/**"
- "applications/ColossalChat/tests/**"
jobs:
tests:
@ -18,9 +19,9 @@ jobs:
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --gpus all --rm -v /data/scratch/github_actions/chat:/data/scratch/github_actions/chat --shm-size=10.24gb
timeout-minutes: 30
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/scratch/examples-data --shm-size=10.24gb
timeout-minutes: 60
defaults:
run:
shell: bash
@ -28,26 +29,37 @@ jobs:
- name: Checkout ColossalAI
uses: actions/checkout@v2
- name: Install Colossal-AI
run: |
pip install --no-cache-dir -v -e .
- name: Install ChatGPT
run: |
cd applications/Chat
pip install -v .
pip install -r examples/requirements.txt
cd applications/ColossalChat
pip install --no-cache-dir -v .
pip install --no-cache-dir -r examples/requirements.txt
- name: Install Transformers
run: |
pip install transformers==4.30.2
pip install --no-cache-dir transformers==4.36.2
- name: Execute Examples
run: |
cd applications/Chat
cd applications/ColossalChat
rm -rf ~/.cache/colossalai
./tests/test_inference.sh
./tests/test_benchmarks.sh
mkdir models
mkdir sft_data
mkdir prompt_data
mkdir preference_data
mkdir kto_data
./tests/test_data_preparation.sh
./tests/test_train.sh
env:
NCCL_SHM_DISABLE: 1
MAX_JOBS: 8
SFT_DATASET: /data/scratch/github_actions/chat/data.json
PROMPT_DATASET: /data/scratch/github_actions/chat/prompts_en.jsonl
PRETRAIN_DATASET: /data/scratch/github_actions/chat/alpaca_data.json
PRETRAINED_MODEL_PATH: ./models
SFT_DATASET: ./sft_data
PROMPT_DATASET: ./prompt_data
PROMPT_RLVR_DATASET: ./prompt_data
PREFERENCE_DATASET: ./preference_data
KTO_DATASET: ./kto_data


@ -4,12 +4,11 @@ on:
pull_request:
types: [synchronize, opened, reopened]
paths:
- 'applications/Chat/coati/**'
- 'applications/Chat/requirements.txt'
- 'applications/Chat/setup.py'
- 'applications/Chat/requirements-test.txt'
- 'applications/Chat/tests/**'
- 'applications/Chat/pytest.ini'
- 'applications/ColossalChat/coati/**'
- 'applications/ColossalChat/requirements.txt'
- 'applications/ColossalChat/setup.py'
- 'applications/ColossalChat/tests/**'
- 'applications/ColossalChat/pytest.ini'
jobs:
tests:
@ -20,8 +19,8 @@ jobs:
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:1.12.0-11.3.0
options: --gpus all --rm -v /data/scratch/chatgpt:/data/scratch/chatgpt
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
options: --gpus all --rm -v /data/scratch/examples-data:/data/scratch/examples-data
timeout-minutes: 30
defaults:
run:
@ -32,15 +31,17 @@ jobs:
- name: Install ChatGPT
run: |
cd applications/Chat
cd applications/ColossalChat
pip install -v .
pip install -r requirements-test.txt
pip install pytest
- name: Execute Unit Testing
run: |
cd applications/Chat
cd applications/ColossalChat
rm -rf ~/.cache/colossalai
pytest tests/
cd ./tests
./test_templating.sh
env:
NCCL_SHM_DISABLE: 1
MAX_JOBS: 8


@ -0,0 +1,54 @@
name: Run colossalqa unit tests
on:
pull_request:
types: [synchronize, opened, reopened]
paths:
- 'applications/ColossalQA/colossalqa/**'
- 'applications/ColossalQA/requirements.txt'
- 'applications/ColossalQA/setup.py'
- 'applications/ColossalQA/tests/**'
- 'applications/ColossalQA/pytest.ini'
jobs:
tests:
name: Run colossalqa unit tests
if: |
github.event.pull_request.draft == false &&
github.base_ref == 'main' &&
github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
runs-on: [self-hosted, gpu]
container:
image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
volumes:
- /data/scratch/test_data_colossalqa:/data/scratch/test_data_colossalqa
- /data/scratch/llama-tiny:/data/scratch/llama-tiny
options: --gpus all --rm
timeout-minutes: 30
defaults:
run:
shell: bash
steps:
- name: Checkout ColossalAI
uses: actions/checkout@v2
- name: Install colossalqa
run: |
cd applications/ColossalQA
pip install -e .
- name: Execute Unit Testing
run: |
cd applications/ColossalQA
pytest tests/
env:
NCCL_SHM_DISABLE: 1
MAX_JOBS: 8
ZH_MODEL_PATH: bigscience/bloom-560m
ZH_MODEL_NAME: bloom
EN_MODEL_PATH: bigscience/bloom-560m
EN_MODEL_NAME: bloom
TEST_DATA_PATH_EN: /data/scratch/test_data_colossalqa/companies.txt
TEST_DATA_PATH_ZH: /data/scratch/test_data_colossalqa/companies_zh.txt
TEST_DOCUMENT_LOADER_DATA_PATH: /data/scratch/test_data_colossalqa/tests/*
SQL_FILE_PATH: /data/scratch/test_data_colossalqa/sql_file_path
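Outside CI, the same ColossalQA checks can be run locally with roughly the commands below (the environment variables above point at CI-only test data and mirror models, so they need to be re-pointed at local paths before the data-dependent tests will pass):

```bash
# Mirror the "Install colossalqa" and "Execute Unit Testing" steps of the new workflow.
cd applications/ColossalQA
pip install -e .
pytest tests/
```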


@ -0,0 +1,34 @@
from datetime import datetime
def open_setup_file():
with open("setup.py", "r") as f:
file_lines = f.readlines()
return file_lines
def replace_nightly_package_info(file_lines):
version = datetime.today().strftime("%Y.%m.%d")
package_name = "colossalai-nightly"
for idx, line in enumerate(file_lines):
if "version = get_version()" in line:
file_lines[idx] = f'version = "{version}"\n'
if 'package_name = "colossalai"' in line:
file_lines[idx] = f'package_name = "{package_name}"\n'
return file_lines
def write_setup_file(file_lines):
with open("setup.py", "w") as f:
f.writelines(file_lines)
def main():
file_lines = open_setup_file()
file_lines = replace_nightly_package_info(file_lines)
write_setup_file(file_lines)
if __name__ == "__main__":
main()
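As shown in the nightly-release workflow diff above, this script replaces the old `NIGHTLY=1` flag: it is run once to rewrite `setup.py`, after which the source distribution is built as usual:

```bash
# Rewrite setup.py so the package is published as colossalai-nightly with a date-based version,
# then build the sdist (the two commands now used by the nightly release job).
python .github/workflows/scripts/update_setup_for_nightly.py
python setup.py sdist build
```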

.gitignore vendored

@ -159,3 +159,7 @@ coverage.xml
# ignore testmon and coverage files
.coverage
.testmondata*
# log, test files - ColossalChat
applications/ColossalChat/logs
applications/ColossalChat/tests/logs

.gitmodules vendored

@ -1,7 +1,3 @@
[submodule "inference"]
path = inference
url = https://github.com/hpcaitech/EnergonAI.git
branch = main
[submodule "examples/tutorial/fastfold/FastFold"]
path = examples/tutorial/fastfold/FastFold
url = https://github.com/hpcaitech/FastFold


@ -1,34 +1,35 @@
repos:
- repo: https://github.com/PyCQA/autoflake
rev: v2.2.1
rev: v2.3.1
hooks:
- id: autoflake
name: autoflake (python)
args: ['--in-place', '--remove-unused-variables', '--remove-all-unused-imports', '--ignore-init-module-imports']
- repo: https://github.com/pycqa/isort
rev: 5.12.0
rev: 5.13.2
hooks:
- id: isort
name: sort all imports (python)
args: ["--profile", "black"] # avoid conflict with black
- repo: https://github.com/psf/black-pre-commit-mirror
rev: 23.9.1
rev: 24.10.0
hooks:
- id: black
name: black formatter
args: ['--line-length=120', '--target-version=py37', '--target-version=py38', '--target-version=py39','--target-version=py310']
- repo: https://github.com/pre-commit/mirrors-clang-format
rev: v13.0.1
rev: v19.1.5
hooks:
- id: clang-format
name: clang formatter
types_or: [c++, c]
- repo: https://github.com/pre-commit/pre-commit-hooks
rev: v4.3.0
rev: v5.0.0
hooks:
- id: check-yaml
- id: check-merge-conflict
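For contributors who want to exercise the updated hooks locally before pushing (much as the removed post-commit workflow used to do automatically), the standard pre-commit invocation applies:

```bash
pip install pre-commit
pre-commit install          # register the git hook for future commits
pre-commit run --all-files  # run every hook in the config above across the repo
```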

LICENSE

@ -527,3 +527,43 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved.
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
---------------- LICENSE FOR LangChain TEAM ----------------
The MIT License
Copyright (c) Harrison Chase
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
---------------- LICENSE FOR Hugging Face accelerate ----------------
Copyright 2021 The HuggingFace Team
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.


@ -1,4 +1,4 @@
include *.txt README.md
recursive-include requirements *.txt
recursive-include colossalai *.cpp *.h *.cu *.tr *.cuh *.cc *.pyi
recursive-include op_builder *.py
recursive-include extensions *.py *.cpp *.h *.cu *.tr *.cuh *.cc *.pyi

README.md

@ -9,7 +9,8 @@
<a href="https://www.colossalai.org/"> Documentation </a> |
<a href="https://github.com/hpcaitech/ColossalAI/tree/main/examples"> Examples </a> |
<a href="https://github.com/hpcaitech/ColossalAI/discussions"> Forum </a> |
<a href="https://medium.com/@hpcaitech"> Blog </a></h3>
<a href="https://colossalai.org/zh-Hans/docs/get_started/bonus/">GPU Cloud Playground </a> |
<a href="https://hpc-ai.com/blog"> Blog </a></h3>
[![GitHub Repo stars](https://img.shields.io/github/stars/hpcaitech/ColossalAI?style=social)](https://github.com/hpcaitech/ColossalAI/stargazers)
[![Build](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml/badge.svg)](https://github.com/hpcaitech/ColossalAI/actions/workflows/build_on_schedule.yml)
@ -24,16 +25,34 @@
</div>
## Get Started with Colossal-AI Without Setup
Access high-end, on-demand compute for your research instantly—no setup needed.
Sign up now and get $10 in credits!
Limited Academic Bonuses:
* Top up $1,000 and receive 300 credits
* Top up $500 and receive 100 credits
<div align="center">
<a href="https://hpc-ai.com/?utm_source=github&utm_medium=social&utm_campaign=promotion-colossalai">
<img src="https://github.com/hpcaitech/public_assets/blob/main/colossalai/img/2-2.gif" width="850" />
</a>
</div>
## Latest News
* [2023/09] [One Half-Day of Training Using a Few Hundred Dollars Yields Similar Results to Mainstream Large Models, Open-Source and Commercial-Free Domain-Specific Llm Solution](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution)
* [2023/09] [70 Billion Parameter LLaMA2 Model Training Accelerated by 195%](https://www.hpc-ai.tech/blog/70b-llama2-training)
* [2023/07] [HPC-AI Tech Raises 22 Million USD in Series A Funding](https://www.hpc-ai.tech/blog/hpc-ai-tech-raises-22-million-usd-in-series-a-funding-to-fuel-team-expansion-and-business-growth)
* [2023/07] [65B Model Pretraining Accelerated by 38%, Best Practices for Building LLaMA-Like Base Models Open-Source](https://www.hpc-ai.tech/blog/large-model-pretraining)
* [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
* [2023/03] [Intel and Colossal-AI Partner to Deliver Cost-Efficient Open-Source Solution for Protein Folding Structure Prediction](https://www.hpc-ai.tech/blog/intel-habana)
* [2023/03] [AWS and Google Fund Colossal-AI with Startup Cloud Programs](https://www.hpc-ai.tech/blog/aws-and-google-fund-colossal-ai-with-startup-cloud-programs)
* [2023/02] [Open Source Solution Replicates ChatGPT Training Process! Ready to go with only 1.6GB GPU Memory](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt)
* [2023/01] [Hardware Savings Up to 46 Times for AIGC and Automatic Parallelism](https://medium.com/pytorch/latest-colossal-ai-boasts-novel-automatic-parallelism-and-offers-savings-up-to-46x-for-stable-1453b48f3f02)
* [2025/02] [DeepSeek 671B Fine-Tuning Guide Revealed—Unlock the Upgraded DeepSeek Suite with One Click, AI Players Ecstatic!](https://company.hpc-ai.com/blog/shocking-release-deepseek-671b-fine-tuning-guide-revealed-unlock-the-upgraded-deepseek-suite-with-one-click-ai-players-ecstatic)
* [2024/12] [The development cost of video generation models has been cut by 50%! Open-source solutions are now available with H200 GPU vouchers](https://company.hpc-ai.com/blog/the-development-cost-of-video-generation-models-has-saved-by-50-open-source-solutions-are-now-available-with-h200-gpu-vouchers) [[code]](https://github.com/hpcaitech/Open-Sora/blob/main/scripts/train.py) [[vouchers]](https://colossalai.org/zh-Hans/docs/get_started/bonus/)
* [2024/10] [How to build a low-cost Sora-like app? Solutions for you](https://company.hpc-ai.com/blog/how-to-build-a-low-cost-sora-like-app-solutions-for-you)
* [2024/09] [Singapore Startup HPC-AI Tech Secures 50 Million USD in Series A Funding to Build the Video Generation AI Model and GPU Platform](https://company.hpc-ai.com/blog/singapore-startup-hpc-ai-tech-secures-50-million-usd-in-series-a-funding-to-build-the-video-generation-ai-model-and-gpu-platform)
* [2024/09] [Reducing AI Large Model Training Costs by 30% Requires Just a Single Line of Code From FP8 Mixed Precision Training Upgrades](https://company.hpc-ai.com/blog/reducing-ai-large-model-training-costs-by-30-requires-just-a-single-line-of-code-from-fp8-mixed-precision-training-upgrades)
* [2024/06] [Open-Sora Continues Open Source: Generate Any 16-Second 720p HD Video with One Click, Model Weights Ready to Use](https://hpc-ai.com/blog/open-sora-from-hpc-ai-tech-team-continues-open-source-generate-any-16-second-720p-hd-video-with-one-click-model-weights-ready-to-use)
* [2024/05] [Large AI Models Inference Speed Doubled, Colossal-Inference Open Source Release](https://hpc-ai.com/blog/colossal-inference)
* [2024/04] [Open-Sora Unveils Major Upgrade: Embracing Open Source with Single-Shot 16-Second Video Generation and 720p Resolution](https://hpc-ai.com/blog/open-soras-comprehensive-upgrade-unveiled-embracing-16-second-video-generation-and-720p-resolution-in-open-source)
* [2024/04] [Most cost-effective solutions for inference, fine-tuning and pretraining, tailored to LLaMA3 series](https://hpc-ai.com/blog/most-cost-effective-solutions-for-inference-fine-tuning-and-pretraining-tailored-to-llama3-series)
## Table of Contents
<ul>
@ -42,6 +61,7 @@
<li>
<a href="#Colossal-AI-in-the-Real-World">Colossal-AI for Real World Applications</a>
<ul>
<li><a href="#Open-Sora">Open-Sora: Revealing Complete Model Parameters, Training Details, and Everything for Sora-like Video Generation Models</a></li>
<li><a href="#Colossal-LLaMA-2">Colossal-LLaMA-2: One Half-Day of Training Using a Few Hundred Dollars Yields Similar Results to Mainstream Large Models, Open-Source and Commercial-Free Domain-Specific Llm Solution</a></li>
<li><a href="#ColossalChat">ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline</a></li>
<li><a href="#AIGC">AIGC: Acceleration of Stable Diffusion</a></li>
@ -51,7 +71,8 @@
<li>
<a href="#Parallel-Training-Demo">Parallel Training Demo</a>
<ul>
<li><a href="#LLaMA2">LLaMA 1/2</a></li>
<li><a href="#LLaMA3">LLaMA 1/2/3 </a></li>
<li><a href="#MoE">MoE</a></li>
<li><a href="#GPT-3">GPT-3</a></li>
<li><a href="#GPT-2">GPT-2</a></li>
<li><a href="#BERT">BERT</a></li>
@ -69,11 +90,11 @@
</ul>
</li>
<li>
<a href="#Inference-Energon-AI-Demo">Inference (Energon-AI) Demo</a>
<a href="#Inference">Inference</a>
<ul>
<li><a href="#GPT-3-Inference">GPT-3</a></li>
<li><a href="#OPT-Serving">OPT-175B Online Serving for Text Generation</a></li>
<li><a href="#BLOOM-Inference">176B BLOOM</a></li>
<li><a href="#Colossal-Inference">Colossal-Inference: Large AI Models Inference Speed Doubled</a></li>
<li><a href="#Grok-1">Grok-1: 314B model of PyTorch + HuggingFace Inference</a></li>
<li><a href="#SwiftInfer">SwiftInfer:Breaks the Length Limit of LLM for Multi-Round Conversations with 46% Acceleration</a></li>
</ul>
</li>
<li>
@ -120,43 +141,65 @@ distributed training and inference in a few lines.
- Friendly Usage
- Parallelism based on the configuration file
- Inference
- [Energon-AI](https://github.com/hpcaitech/EnergonAI)
<p align="right">(<a href="#top">back to top</a>)</p>
## Colossal-AI in the Real World
### Open-Sora
[Open-Sora](https://github.com/hpcaitech/Open-Sora): Revealing Complete Model Parameters, Training Details, and Everything for Sora-like Video Generation Models
[[code]](https://github.com/hpcaitech/Open-Sora)
[[blog]](https://hpc-ai.com/blog/open-sora-from-hpc-ai-tech-team-continues-open-source-generate-any-16-second-720p-hd-video-with-one-click-model-weights-ready-to-use)
[[Model weights]](https://github.com/hpcaitech/Open-Sora?tab=readme-ov-file#model-weights)
[[Demo]](https://github.com/hpcaitech/Open-Sora?tab=readme-ov-file#-latest-demo)
[[GPU Cloud Playground]](https://cloud.luchentech.com/)
[[OpenSora Image]](https://cloud.luchentech.com/doc/docs/image/open-sora/)
<div align="center">
<a href="https://youtu.be/ilMQpU71ddI?si=J4JSPzZ03ycYmlki">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/sora/opensora-v1.2.png" width="700" />
</a>
</div>
<p align="right">(<a href="#top">back to top</a>)</p>
### Colossal-LLaMA-2
- One half-day of training using a few hundred dollars yields similar results to mainstream large models, open-source and commercial-free domain-specific LLM solution.
[[GPU Cloud Playground]](https://cloud.luchentech.com/)
[[LLaMA3 Image]](https://cloud.luchentech.com/doc/docs/image/llama)
- 7B: One half-day of training using a few hundred dollars yields similar results to mainstream large models, open-source and commercial-free domain-specific LLM solution.
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Colossal-LLaMA-2)
[[blog]](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution)
[[HuggingFace model weights]](https://huggingface.co/hpcai-tech/Colossal-LLaMA-2-7b-base)
[[Modelscope model weights]](https://www.modelscope.cn/models/colossalai/Colossal-LLaMA-2-7b-base/summary)
| | Backbone | Tokens Consumed | | MMLU | CMMLU | AGIEval | GAOKAO | CEval |
| :----------------------------: | :--------: | :-------------: | :------------------: | :-----------: | :-----: | :----: | :----: | :------------------------------: |
| | | - | | 5-shot | 5-shot | 5-shot | 0-shot | 5-shot |
| Baichuan-7B | - | 1.2T | | 42.32 (42.30) | 44.53 (44.02) | 38.72 | 36.74 | 42.80 |
| Baichuan-13B-Base | - | 1.4T | | 50.51 (51.60) | 55.73 (55.30) | 47.20 | 51.41 | 53.60 |
| Baichuan2-7B-Base | - | 2.6T | | 46.97 (54.16) | 57.67 (57.07) | 45.76 | 52.60 | 54.00 |
| Baichuan2-13B-Base | - | 2.6T | | 54.84 (59.17) | 62.62 (61.97) | 52.08 | 58.25 | 58.10 |
| ChatGLM-6B | - | 1.0T | | 39.67 (40.63) | 41.17 (-) | 40.10 | 36.53 | 38.90 |
| ChatGLM2-6B | - | 1.4T | | 44.74 (45.46) | 49.40 (-) | 46.36 | 45.49 | 51.70 |
| InternLM-7B | - | 1.6T | | 46.70 (51.00) | 52.00 (-) | 44.77 | 61.64 | 52.80 |
| Qwen-7B | - | 2.2T | | 54.29 (56.70) | 56.03 (58.80) | 52.47 | 56.42 | 59.60 |
| | | | | | | | | |
| Llama-2-7B | - | 2.0T | | 44.47 (45.30) | 32.97 (-) | 32.60 | 25.46 | - |
| Linly-AI/Chinese-LLaMA-2-7B-hf | Llama-2-7B | 1.0T | | 37.43 | 29.92 | 32.00 | 27.57 | - |
| wenge-research/yayi-7b-llama2 | Llama-2-7B | - | | 38.56 | 31.52 | 30.99 | 25.95 | - |
| ziqingyang/chinese-llama-2-7b | Llama-2-7B | - | | 33.86 | 34.69 | 34.52 | 25.18 | 34.2 |
| TigerResearch/tigerbot-7b-base | Llama-2-7B | 0.3T | | 43.73 | 42.04 | 37.64 | 30.61 | - |
| LinkSoul/Chinese-Llama-2-7b | Llama-2-7B | - | | 48.41 | 38.31 | 38.45 | 27.72 | - |
| FlagAlpha/Atom-7B | Llama-2-7B | 0.1T | | 49.96 | 41.10 | 39.83 | 33.00 | - |
| IDEA-CCNL/Ziya-LLaMA-13B-v1.1 | Llama-13B | 0.11T | | 50.25 | 40.99 | 40.04 | 30.54 | - |
| | | | | | | | | |
| **Colossal-LLaMA-2-7b-base** | Llama-2-7B | **0.0085T** | | 53.06 | 49.89 | 51.48 | 58.82 | 50.2 |
- 13B: Construct refined 13B private model with just $5000 USD.
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Colossal-LLaMA-2)
[[blog]](https://hpc-ai.com/blog/colossal-llama-2-13b)
[[HuggingFace model weights]](https://huggingface.co/hpcai-tech/Colossal-LLaMA-2-13b-base)
[[Modelscope model weights]](https://www.modelscope.cn/models/colossalai/Colossal-LLaMA-2-13b-base/summary)
| Model | Backbone | Tokens Consumed | MMLU (5-shot) | CMMLU (5-shot)| AGIEval (5-shot) | GAOKAO (0-shot) | CEval (5-shot) |
| :-----------------------------: | :--------: | :-------------: | :------------------: | :-----------: | :--------------: | :-------------: | :-------------: |
| Baichuan-7B | - | 1.2T | 42.32 (42.30) | 44.53 (44.02) | 38.72 | 36.74 | 42.80 |
| Baichuan-13B-Base | - | 1.4T | 50.51 (51.60) | 55.73 (55.30) | 47.20 | 51.41 | 53.60 |
| Baichuan2-7B-Base | - | 2.6T | 46.97 (54.16) | 57.67 (57.07) | 45.76 | 52.60 | 54.00 |
| Baichuan2-13B-Base | - | 2.6T | 54.84 (59.17) | 62.62 (61.97) | 52.08 | 58.25 | 58.10 |
| ChatGLM-6B | - | 1.0T | 39.67 (40.63) | 41.17 (-) | 40.10 | 36.53 | 38.90 |
| ChatGLM2-6B | - | 1.4T | 44.74 (45.46) | 49.40 (-) | 46.36 | 45.49 | 51.70 |
| InternLM-7B | - | 1.6T | 46.70 (51.00) | 52.00 (-) | 44.77 | 61.64 | 52.80 |
| Qwen-7B | - | 2.2T | 54.29 (56.70) | 56.03 (58.80) | 52.47 | 56.42 | 59.60 |
| Llama-2-7B | - | 2.0T | 44.47 (45.30) | 32.97 (-) | 32.60 | 25.46 | - |
| Linly-AI/Chinese-LLaMA-2-7B-hf | Llama-2-7B | 1.0T | 37.43 | 29.92 | 32.00 | 27.57 | - |
| wenge-research/yayi-7b-llama2 | Llama-2-7B | - | 38.56 | 31.52 | 30.99 | 25.95 | - |
| ziqingyang/chinese-llama-2-7b | Llama-2-7B | - | 33.86 | 34.69 | 34.52 | 25.18 | 34.2 |
| TigerResearch/tigerbot-7b-base | Llama-2-7B | 0.3T | 43.73 | 42.04 | 37.64 | 30.61 | - |
| LinkSoul/Chinese-Llama-2-7b | Llama-2-7B | - | 48.41 | 38.31 | 38.45 | 27.72 | - |
| FlagAlpha/Atom-7B | Llama-2-7B | 0.1T | 49.96 | 41.10 | 39.83 | 33.00 | - |
| IDEA-CCNL/Ziya-LLaMA-13B-v1.1 | Llama-13B | 0.11T | 50.25 | 40.99 | 40.04 | 30.54 | - |
| **Colossal-LLaMA-2-7b-base** | Llama-2-7B | **0.0085T** | 53.06 | 49.89 | 51.48 | 58.82 | 50.2 |
| **Colossal-LLaMA-2-13b-base** | Llama-2-13B | **0.025T** | 56.42 | 61.80 | 54.69 | 69.53 | 60.3 |
### ColossalChat
@ -215,7 +258,7 @@ Acceleration of AIGC (AI-Generated Content) models such as [Stable Diffusion v1]
- [DreamBooth Fine-tuning](https://github.com/hpcaitech/ColossalAI/tree/main/examples/images/dreambooth): Personalize your model using just 3-5 images of the desired subject.
<p id="inference" align="center">
<p id="inference-sd" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Stable%20Diffusion%20Inference.jpg" width=800/>
</p>
@ -249,13 +292,23 @@ Acceleration of [AlphaFold Protein Structure](https://alphafold.ebi.ac.uk/)
<p align="right">(<a href="#top">back to top</a>)</p>
## Parallel Training Demo
### LLaMA3
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/images/LLaMA3-70B-H100.png" width=600/>
</p>
- 70 billion parameter LLaMA3 model training accelerated by 18%
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/llama)
[[GPU Cloud Playground]](https://cloud.luchentech.com/)
[[LLaMA3 Image]](https://cloud.luchentech.com/doc/docs/image/llama)
### LLaMA2
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/llama2_pretraining.png" width=600/>
</p>
- 70 billion parameter LLaMA2 model training accelerated by 195%
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/llama2)
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/llama)
[[blog]](https://www.hpc-ai.tech/blog/70b-llama2-training)
### LLaMA1
@ -264,9 +317,18 @@ Acceleration of [AlphaFold Protein Structure](https://alphafold.ebi.ac.uk/)
</p>
- 65-billion-parameter large model pretraining accelerated by 38%
[[code]](https://github.com/hpcaitech/ColossalAI/tree/example/llama/examples/language/llama)
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/llama)
[[blog]](https://www.hpc-ai.tech/blog/large-model-pretraining)
### MoE
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/images/MOE_training.png" width=800/>
</p>
- Enhanced MoE parallelism: open-source MoE model training can be 9 times more efficient
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/openmoe)
[[blog]](https://www.hpc-ai.tech/blog/enhanced-moe-parallelism-open-source-moe-model-training-can-be-9-times-more-efficient)
### GPT-3
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/GPT3-v5.png" width=700/>
@ -336,32 +398,47 @@ Please visit our [documentation](https://www.colossalai.org/) and [examples](htt
<p align="right">(<a href="#top">back to top</a>)</p>
## Inference (Energon-AI) Demo
<p id="GPT-3-Inference" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/inference_GPT-3.jpg" width=800/>
## Inference
### Colossal-Inference
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/inference/colossal-inference-v1-1.png" width=1000/>
</p>
- [Energon-AI](https://github.com/hpcaitech/EnergonAI): 50% inference acceleration on the same hardware
<p id="OPT-Serving" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/BLOOM%20serving.png" width=600/>
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/inference/colossal-inference-v1-2.png" width=1000/>
</p>
- [OPT Serving](https://colossalai.org/docs/advanced_tutorials/opt_service): Try 175-billion-parameter OPT online services
- Large AI models inference speed doubled, compared to the offline inference performance of vLLM in some cases.
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/inference)
[[blog]](https://hpc-ai.com/blog/colossal-inference)
[[GPU Cloud Playground]](https://cloud.luchentech.com/)
[[LLaMA3 Image]](https://cloud.luchentech.com/doc/docs/image/llama)
<p id="BLOOM-Inference" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/BLOOM%20Inference.PNG" width=800/>
### Grok-1
<p id="Grok-1" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/examples/images/grok-1-inference.jpg" width=600/>
</p>
- [BLOOM](https://github.com/hpcaitech/EnergonAI/tree/main/examples/bloom): Reduce hardware deployment costs of 176-billion-parameter BLOOM by more than 10 times.
- 314 Billion Parameter Grok-1 Inference Accelerated by 3.8x, an easy-to-use Python + PyTorch + HuggingFace version for Inference.
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/examples/language/grok-1)
[[blog]](https://hpc-ai.com/blog/314-billion-parameter-grok-1-inference-accelerated-by-3.8x-efficient-and-easy-to-use-pytorchhuggingface-version-is-here)
[[HuggingFace Grok-1 PyTorch model weights]](https://huggingface.co/hpcai-tech/grok-1)
[[ModelScope Grok-1 PyTorch model weights]](https://www.modelscope.cn/models/colossalai/grok-1-pytorch/summary)
### SwiftInfer
<p id="SwiftInfer" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/SwiftInfer.jpg" width=800/>
</p>
- [SwiftInfer](https://github.com/hpcaitech/SwiftInfer): Inference performance improved by 46%, open source solution breaks the length limit of LLM for multi-round conversations
<p align="right">(<a href="#top">back to top</a>)</p>
## Installation
Requirements:
- PyTorch >= 1.11 (PyTorch 2.x in progress)
- PyTorch >= 2.2
- Python >= 3.7
- CUDA >= 11.0
- [NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher)
@ -379,10 +456,10 @@ pip install colossalai
**Note: only Linux is supported for now.**
However, if you want to build the PyTorch extensions during installation, you can set `CUDA_EXT=1`.
However, if you want to build the PyTorch extensions during installation, you can set `BUILD_EXT=1`.
```bash
CUDA_EXT=1 pip install colossalai
BUILD_EXT=1 pip install colossalai
```
**Otherwise, CUDA kernels will be built during runtime when you actually need them.**
@ -410,7 +487,7 @@ By default, we do not compile CUDA/C++ kernels. ColossalAI will build them durin
If you want to install and enable CUDA kernel fusion (compulsory installation when using fused optimizer):
```shell
CUDA_EXT=1 pip install .
BUILD_EXT=1 pip install .
```
For Users with CUDA 10.2, you can still build ColossalAI from source. However, you need to manually download the cub library and copy it to the corresponding directory.
@ -426,7 +503,7 @@ unzip 1.8.0.zip
cp -r cub-1.8.0/cub/ colossalai/kernel/cuda_native/csrc/kernels/include/
# install
CUDA_EXT=1 pip install .
BUILD_EXT=1 pip install .
```
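A quick sanity check after either install path is to import the package and print its version; this is a minimal sketch and assumes the environment already has a compatible PyTorch installed:

```bash
python -c "import colossalai; print(colossalai.__version__)"
```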
<p align="right">(<a href="#top">back to top</a>)</p>
@ -495,11 +572,22 @@ This project is inspired by some related projects (some by our team and some by
To cite this project, you can use the following BibTeX citation.
```
@article{bian2021colossal,
title={Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training},
author={Bian, Zhengda and Liu, Hongxin and Wang, Boxiang and Huang, Haichen and Li, Yongbin and Wang, Chuanrui and Cui, Fan and You, Yang},
journal={arXiv preprint arXiv:2110.14883},
year={2021}
@inproceedings{10.1145/3605573.3605613,
author = {Li, Shenggui and Liu, Hongxin and Bian, Zhengda and Fang, Jiarui and Huang, Haichen and Liu, Yuliang and Wang, Boxiang and You, Yang},
title = {Colossal-AI: A Unified Deep Learning System For Large-Scale Parallel Training},
year = {2023},
isbn = {9798400708435},
publisher = {Association for Computing Machinery},
address = {New York, NY, USA},
url = {https://doi.org/10.1145/3605573.3605613},
doi = {10.1145/3605573.3605613},
abstract = {The success of Transformer models has pushed the deep learning model scale to billions of parameters, but the memory limitation of a single GPU has led to an urgent need for training on multi-GPU clusters. However, the best practice for choosing the optimal parallel strategy is still lacking, as it requires domain expertise in both deep learning and parallel computing. The Colossal-AI system addressed the above challenge by introducing a unified interface to scale your sequential code of model training to distributed environments. It supports parallel training methods such as data, pipeline, tensor, and sequence parallelism and is integrated with heterogeneous training and zero redundancy optimizer. Compared to the baseline system, Colossal-AI can achieve up to 2.76 times training speedup on large-scale models.},
booktitle = {Proceedings of the 52nd International Conference on Parallel Processing},
pages = {766--775},
numpages = {10},
keywords = {datasets, gaze detection, text tagging, neural networks},
location = {Salt Lake City, UT, USA},
series = {ICPP '23}
}
```


@ -1,521 +0,0 @@
<h1 align="center">
<img width="auto" height="100px", src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/logo_coati.png"/>
<br/>
<span>ColossalChat</span>
</h1>
## Table of Contents
- [Table of Contents](#table-of-contents)
- [What is ColossalChat and Coati ?](#what-is-colossalchat-and-coati-)
- [Online demo](#online-demo)
- [Install](#install)
- [Install the environment](#install-the-environment)
- [Install the Transformers](#install-the-transformers)
- [How to use?](#how-to-use)
- [Supervised datasets collection](#supervised-datasets-collection)
- [RLHF Training Stage1 - Supervised instructs tuning](#RLHF-training-stage1---supervised-instructs-tuning)
- [RLHF Training Stage2 - Training reward model](#RLHF-training-stage2---training-reward-model)
- [RLHF Training Stage3 - Training model with reinforcement learning by human feedback](#RLHF-training-stage3---training-model-with-reinforcement-learning-by-human-feedback)
- [Inference Quantization and Serving - After Training](#inference-quantization-and-serving---after-training)
- [Coati7B examples](#coati7b-examples)
- [Generation](#generation)
- [Open QA](#open-qa)
- [Limitation for LLaMA-finetuned models](#limitation)
- [Limitation of dataset](#limitation)
- [FAQ](#faq)
- [How to save/load checkpoint](#faq)
- [How to train with limited resources](#faq)
- [The Plan](#the-plan)
- [Real-time progress](#real-time-progress)
- [Invitation to open-source contribution](#invitation-to-open-source-contribution)
- [Quick Preview](#quick-preview)
- [Authors](#authors)
- [Citations](#citations)
- [Licenses](#licenses)
---
## What is ColossalChat and Coati ?
[ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat) is a project to implement LLMs with RLHF, powered by the [Colossal-AI](https://github.com/hpcaitech/ColossalAI) project.
Coati stands for `ColossalAI Talking Intelligence`. It is the name for the module implemented in this project and is also the name of the large language model developed by the ColossalChat project.
The Coati package provides a unified large language model framework that implements the following functions:
- Supports comprehensive large-model training acceleration capabilities for ColossalAI, without requiring knowledge of complex distributed training algorithms
- Supervised datasets collection
- Supervised instructions fine-tuning
- Training reward model
- Reinforcement learning with human feedback
- Quantization inference
- Fast model deploying
- Perfectly integrated with the Hugging Face ecosystem, a high degree of model customization
<div align="center">
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/chatgpt.png" width=700/>
</p>
Image source: https://openai.com/blog/chatgpt
</div>
**As Colossal-AI is undergoing some major updates, this project will be actively maintained to stay in line with the Colossal-AI project.**
More details can be found in the latest news.
- [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
- [2023/02] [Open Source Solution Replicates ChatGPT Training Process! Ready to go with only 1.6GB GPU Memory](https://www.hpc-ai.tech/blog/colossal-ai-chatgpt)
## Online demo
<div align="center">
<a href="https://www.youtube.com/watch?v=HcTiHzApHm0">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/ColossalChat%20YouTube.png" width="700" />
</a>
</div>
[ColossalChat](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat): An open-source solution for cloning [ChatGPT](https://openai.com/blog/chatgpt/) with a complete RLHF pipeline.
[[code]](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat)
[[blog]](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b)
[[demo]](https://www.youtube.com/watch?v=HcTiHzApHm0)
[[tutorial]](https://www.youtube.com/watch?v=-qFBZFmOJfg)
<p id="ColossalChat-Speed" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/ColossalChat%20Speed.jpg" width=450/>
</p>
> DeepSpeedChat performance comes from its blog post of April 12, 2023; ColossalChat performance can be reproduced on an AWS p4d.24xlarge node with 8 A100-40G GPUs with the following command: `torchrun --standalone --nproc_per_node 8 benchmark_opt_lora_dummy.py --num_collect_steps 1 --use_kernels --strategy colossalai_zero2 --experience_batch_size 64 --train_batch_size 32`
## Install
### Install the environment
```bash
conda create -n coati
conda activate coati
git clone https://github.com/hpcaitech/ColossalAI.git
cd ColossalAI/applications/Chat
pip install .
```
### Install the Transformers
```bash
pip install transformers==4.30.2
```
## How to use?
### Supervised datasets collection
We collected a 104K bilingual dataset of Chinese and English; you can find the data in the
[InstructionWild](https://github.com/XueFuzhao/InstructionWild) repo and in this [file](https://github.com/XueFuzhao/InstructionWild/blob/main/data/README.md).
Here is how we collected the data
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/data-collect.png" width=500/>
</p>
### RLHF Training Stage1 - Supervised instructs tuning
Stage 1 is supervised instruction fine-tuning, which uses the datasets mentioned earlier to fine-tune the model.
You can run `examples/train_sft.sh` to start supervised instruction fine-tuning.
[[Stage1 tutorial video]](https://www.youtube.com/watch?v=-qFBZFmOJfg)
**Note**: the supervised dataset uses the following format:
```json
[
{
"instruction": "Provide a list of the top 10 most popular mobile games in Asia",
"input": "",
"output": "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
"id": 0
},
...
]
```
### RLHF Training Stage2 - Training reward model
Stage 2 trains a reward model: human annotators rank different outputs for the same prompt, and the resulting scores supervise the training of the reward model.
You can run `examples/train_rm.sh` to start reward model training.
[[Stage2 tutorial video]](https://www.youtube.com/watch?v=gMx2CApKhuo)
### RLHF Training Stage3 - Training model with reinforcement learning by human feedback
Stage 3 uses a reinforcement learning algorithm and is the most complex part of the training process:
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/stage-3.jpeg" width=800/>
</p>
You can run the `examples/train_prompts.sh` to start training PPO with human feedback.
[[Stage3 tutorial video]](https://www.youtube.com/watch?v=Z8wwSHxPL9g)
**Note**: the required datasets use the following formats:
- `pretrain dataset`
```json
[
{
"instruction": "Provide a list of the top 10 most popular mobile games in Asia",
"input": "",
"output": "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
"id": 0
},
...
]
```
- `prompt dataset`
```json
[
{
"instruction": "Edit this paragraph to make it more concise: \"Yesterday, I went to the store and bought some things. Then, I came home and put them away. After that, I went for a walk and met some friends.\"",
"id": 0
},
{
"instruction": "Write a descriptive paragraph about a memorable vacation you went on",
"id": 1
},
...
]
```
For more details, see [`examples/`](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples).
### Inference Quantization and Serving - After Training
We provide an online inference server and a benchmark. We aim to run inference on a single GPU, so quantization is essential when using large models.
We support 8-bit quantization (RTN), 4-bit quantization (GPTQ), and FP16 inference.
Online inference server scripts can help you deploy your own services.
For more details, see [`inference/`](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/inference).
## Coati7B examples
### Generation
<details><summary><b>E-mail</b></summary>
![phd](https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/Phd.png)
</details>
<details><summary><b>coding</b></summary>
![sort](https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/quick_sort.png)
</details>
<details><summary><b>regex</b></summary>
![regex](https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/regex.png)
</details>
<details><summary><b>Tex</b></summary>
![tex](https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/tex.png)
</details>
<details><summary><b>writing</b></summary>
![writing](https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/writing.png)
</details>
<details><summary><b>Table</b></summary>
![Table](https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/table.png)
</details>
### Open QA
<details><summary><b>Game</b></summary>
![Game](https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/game.png)
</details>
<details><summary><b>Travel</b></summary>
![Travel](https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/travel.png)
</details>
<details><summary><b>Physical</b></summary>
![Physical](https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/physical.png)
</details>
<details><summary><b>Chemical</b></summary>
![Chemical](https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/chemical.png)
</details>
<details><summary><b>Economy</b></summary>
![Economy](https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/economy.png)
</details>
You can find more examples in this [repo](https://github.com/XueFuzhao/InstructionWild/blob/main/comparison.md).
### Limitation
<details><summary><b>Limitation for LLaMA-finetuned models</b></summary>
- Both Alpaca and ColossalChat are based on LLaMA. It is hard to compensate for the missing knowledge in the pre-training stage.
- Lack of counting ability: Cannot count the number of items in a list.
- Lack of Logics (reasoning and calculation)
- Tend to repeat the last sentence (fail to produce the end token).
- Poor multilingual results: LLaMA is mainly trained on English datasets (Generation performs better than QA).
</details>
<details><summary><b>Limitation of dataset</b></summary>
- Lack of summarization ability: No such instructions in finetune datasets.
- Lack of multi-turn chat: No such instructions in finetune datasets
- Lack of self-recognition: No such instructions in finetune datasets
- Lack of Safety:
- When the input contains fake facts, the model makes up false facts and explanations.
- Cannot abide by OpenAI's policy: since the prompts were generated from the OpenAI API, which always abides by its policy, there are no violation cases in the datasets.
</details>
## FAQ
<details><summary><b>How to save/load checkpoint</b></summary>
We have integrated the Transformers save and load pipeline, allowing users to freely call Hugging Face's language models and save them in the HF format.
```python
from coati.models.llama import LlamaLM
from coati.trainer import SFTTrainer
model = LlamaLM(pretrained=args.pretrain)
tokenizer = AutoTokenizer.from_pretrained(args.pretrain)
(model, optim) = strategy.prepare((model, optim))
trainer = SFTTrainer(model=model,
strategy=strategy,
optim=optim,
train_dataloader=train_dataloader,
eval_dataloader=eval_dataloader,
batch_size=args.batch_size,
max_epochs=args.max_epochs,
accumulation_steps=args.accumulation_steps
)
trainer.fit()
# this saves in pytorch format
strategy.save_model(model, args.save_path, only_rank0=True)
# this saves in HF format
strategy.save_pretrained(model, args.save_path, only_rank0=True, tokenizer=tokenizer)
```
</details>
<details><summary><b>How to train with limited resources</b></summary>
Here are some examples that can allow you to train a 7B model on a single or multiple consumer-grade GPUs.
If you only have a single 24G GPU, you can use the following script. `batch_size`, `lora_rank` and `grad_checkpoint` are the most important parameters to successfully train the model.
```bash
# [INFO]: MAX GPU MEMORY ALLOCATED: 19148.9345703125 MB
torchrun --standalone --nproc_per_node=1 train_sft.py \
--pretrain "/path/to/LLaMa-7B/" \
--model 'llama' \
--strategy ddp \
--save_path /path/to/Coati-7B \
--dataset /path/to/data.json \
--batch_size 1 \
--accumulation_steps 8 \
--lr 2e-5 \
--max_datasets_size 512 \
--max_epochs 1 \
--lora_rank 16 \
--grad_checkpoint
```
The `colossalai_gemini` strategy can enable a single 24G GPU to train the whole model without LoRA if you have sufficient CPU memory. You can use the following script.
```bash
torchrun --standalone --nproc_per_node=1 train_sft.py \
--pretrain "/path/to/LLaMa-7B/" \
--model 'llama' \
--strategy colossalai_gemini \
--save_path /path/to/Coati-7B \
--dataset /path/to/data.json \
--batch_size 1 \
--accumulation_steps 8 \
--lr 2e-5 \
--max_datasets_size 512 \
--max_epochs 1 \
--grad_checkpoint
```
If you have 4x32 GB GPUs, you can even train the whole 7B model using our `colossalai_zero2_cpu` strategy! The script is given as follows.
```bash
torchrun --standalone --nproc_per_node=4 train_sft.py \
--pretrain "/path/to/LLaMa-7B/" \
--model 'llama' \
--strategy colossalai_zero2_cpu \
--save_path /path/to/Coati-7B \
--dataset /path/to/data.json \
--batch_size 1 \
--accumulation_steps 8 \
--lr 2e-5 \
--max_datasets_size 512 \
--max_epochs 1 \
--grad_checkpoint
```
</details>
## The Plan
- [x] implement PPO fine-tuning
- [x] implement training reward model
- [x] support LoRA
- [x] support inference
- [x] support llama from [facebook](https://github.com/facebookresearch/llama)
- [x] implement PPO-ptx fine-tuning
- [ ] integrate with Ray
- [ ] support more RL paradigms, like Implicit Language Q-Learning (ILQL),
- [ ] support chain-of-thought by [langchain](https://github.com/hwchase17/langchain)
### Real-time progress
You can find our progress on the GitHub [project board](https://github.com/orgs/hpcaitech/projects/17/views/1).
## Invitation to open-source contribution
Following the successful examples of [BLOOM](https://bigscience.huggingface.co/) and [Stable Diffusion](https://en.wikipedia.org/wiki/Stable_Diffusion), any and all developers and partners with computing power, datasets, or models are welcome to join and build the Colossal-AI community, working towards the era of big AI models starting from the replication of ChatGPT!
You may contact us or participate in the following ways:
1. [Leave a Star ⭐](https://github.com/hpcaitech/ColossalAI/stargazers) to show your support. Thanks!
2. Post an [issue](https://github.com/hpcaitech/ColossalAI/issues/new/choose) or submit a PR on GitHub, following the guidelines in [Contributing](https://github.com/hpcaitech/ColossalAI/blob/main/CONTRIBUTING.md).
3. Join the Colossal-AI community on
[Slack](https://github.com/hpcaitech/public_assets/tree/main/colossalai/contact/slack)
and [WeChat (微信)](https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png "qrcode") to share your ideas.
4. Send your official proposal by email to contact@hpcaitech.com.
Thanks so much to all of our amazing contributors!
## Quick Preview
<div align="center">
<a href="https://chat.colossalai.org/">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/Chat-demo.png" width="700" />
</a>
</div>
- An open-source low-cost solution for cloning [ChatGPT](https://openai.com/blog/chatgpt/) with a complete RLHF pipeline. [[demo]](https://chat.colossalai.org)
<p id="ChatGPT_scaling" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT%20scaling.png" width=800/>
</p>
- Up to 7.73 times faster for single server training and 1.42 times faster for single-GPU inference
<p id="ChatGPT-1GPU" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/ChatGPT-1GPU.jpg" width=450/>
</p>
- Up to 10.3x growth in model capacity on one GPU
- A mini demo training process requires only 1.62GB of GPU memory (any consumer-grade GPU)
<p id="inference" align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chatgpt/LoRA%20data.jpg" width=600/>
</p>
- Increase the fine-tuning model capacity by up to 3.7 times on a single GPU
- While keeping a sufficiently high running speed
| Model Pair | Alpaca-7B ⚔ Coati-7B | Coati-7B ⚔ Alpaca-7B |
| :-----------: | :------------------: | :------------------: |
| Better Cases | 38 ⚔ **41** | **45** ⚔ 33 |
| Win Rate | 48% ⚔ **52%** | **58%** ⚔ 42% |
| Average Score | 7.06 ⚔ **7.13** | **7.31** ⚔ 6.82 |
- Our Coati-7B model performs better than Alpaca-7B when GPT-4 is used to evaluate model performance. The Coati-7B model evaluated here is an older version trained a few weeks ago; a newer version is around the corner.
## Authors
Coati is developed by the ColossalAI Team:
- [Fazzie](https://fazzie-key.cool/about/index.html)
- [FrankLeeeee](https://github.com/FrankLeeeee)
- [BlueRum](https://github.com/ht-zhou)
- [ver217](https://github.com/ver217)
- [ofey404](https://github.com/ofey404)
- [Wenhao Chen](https://github.com/CWHer)
The PhD students from the [HPC-AI Lab](https://ai.comp.nus.edu.sg/) also contributed a lot to this project.
- [Zangwei Zheng](https://github.com/zhengzangw)
- [Xue Fuzhao](https://github.com/XueFuzhao)
## Citations
```bibtex
@article{Hu2021LoRALA,
title = {LoRA: Low-Rank Adaptation of Large Language Models},
author = {Edward J. Hu and Yelong Shen and Phillip Wallis and Zeyuan Allen-Zhu and Yuanzhi Li and Shean Wang and Weizhu Chen},
journal = {ArXiv},
year = {2021},
volume = {abs/2106.09685}
}
@article{ouyang2022training,
title={Training language models to follow instructions with human feedback},
author={Ouyang, Long and Wu, Jeff and Jiang, Xu and Almeida, Diogo and Wainwright, Carroll L and Mishkin, Pamela and Zhang, Chong and Agarwal, Sandhini and Slama, Katarina and Ray, Alex and others},
journal={arXiv preprint arXiv:2203.02155},
year={2022}
}
@article{touvron2023llama,
title={LLaMA: Open and Efficient Foundation Language Models},
author={Touvron, Hugo and Lavril, Thibaut and Izacard, Gautier and Martinet, Xavier and Lachaux, Marie-Anne and Lacroix, Timoth{\'e}e and Rozi{\`e}re, Baptiste and Goyal, Naman and Hambro, Eric and Azhar, Faisal and Rodriguez, Aurelien and Joulin, Armand and Grave, Edouard and Lample, Guillaume},
journal={arXiv preprint arXiv:2302.13971},
year={2023}
}
@misc{alpaca,
author = {Rohan Taori and Ishaan Gulrajani and Tianyi Zhang and Yann Dubois and Xuechen Li and Carlos Guestrin and Percy Liang and Tatsunori B. Hashimoto },
title = {Stanford Alpaca: An Instruction-following LLaMA model},
year = {2023},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/tatsu-lab/stanford_alpaca}},
}
@misc{instructionwild,
author = {Fuzhao Xue and Zangwei Zheng and Yang You },
title = {Instruction in the Wild: A User-based Instruction Dataset},
year = {2023},
publisher = {GitHub},
journal = {GitHub repository},
howpublished = {\url{https://github.com/XueFuzhao/InstructionWild}},
}
```
## Licenses
Coati is licensed under the [Apache 2.0 License](LICENSE).

@ -1,38 +0,0 @@
# Benchmarks
## Benchmark OPT with LoRA on dummy prompt data
We provide various OPT models (the string in parentheses is the corresponding model name used in this script):
- OPT-125M (125m)
- OPT-350M (350m)
- OPT-700M (700m)
- OPT-1.3B (1.3b)
- OPT-2.7B (2.7b)
- OPT-3.5B (3.5b)
- OPT-5.5B (5.5b)
- OPT-6.7B (6.7b)
- OPT-10B (10b)
- OPT-13B (13b)
We also provide various training strategies:
- ddp: torch DDP
- colossalai_gemini: ColossalAI GeminiDDP with `placement_policy="cuda"`, like zero3
- colossalai_gemini_cpu: ColossalAI GeminiDDP with `placement_policy="cpu"`, like zero3-offload
- colossalai_zero2: ColossalAI zero2
- colossalai_zero2_cpu: ColossalAI zero2-offload
- colossalai_zero1: ColossalAI zero1
- colossalai_zero1_cpu: ColossalAI zero1-offload
Currently we only support launching with `torchrun`. For example:
```bash
# run OPT-125M with no lora (lora_rank=0) on single-node single-GPU with min batch size
torchrun --standalone --nproc_per_node 1 benchmark_opt_lora_dummy.py \
--model 125m --critic_model 125m --strategy ddp \
--experience_batch_size 1 --train_batch_size 1 --lora_rank 0
# run Actor (OPT-1.3B) and Critic (OPT-350M) with lora_rank=4 on single-node 4-GPU
torchrun --standalone --nproc_per_node 4 benchmark_opt_lora_dummy.py \
--model 1.3b --critic_model 350m --strategy colossalai_zero2 --lora_rank 4
```
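Other strategies from the list above can be benchmarked in the same way by switching `--strategy`. A hypothetical example (the strategy name and flags below come from the script's argument parser):
```bash
# (illustrative) run Actor (OPT-1.3B) and Critic (OPT-350M) with the Gemini (zero3-like) strategy, no LoRA, on 4 GPUs
torchrun --standalone --nproc_per_node 4 benchmark_opt_lora_dummy.py \
    --model 1.3b --critic_model 350m --strategy colossalai_gemini --lora_rank 0
```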

@ -1,208 +0,0 @@
import argparse
from copy import deepcopy
import torch
import torch.distributed as dist
import torch.nn as nn
from coati.models.base import RewardModel
from coati.models.opt import OPTActor, OPTCritic
from coati.trainer import PPOTrainer
from coati.trainer.callbacks import PerformanceEvaluator
from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy, Strategy
from torch.optim import Adam
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from transformers.models.opt.configuration_opt import OPTConfig
from colossalai.nn.optimizer import HybridAdam
def get_model_numel(model: nn.Module, strategy: Strategy) -> int:
numel = sum(p.numel() for p in model.parameters())
if isinstance(strategy, GeminiStrategy) and strategy.shard_init:
numel *= dist.get_world_size()
return numel
def preprocess_batch(samples) -> dict:
input_ids = torch.stack(samples)
attention_mask = torch.ones_like(input_ids, dtype=torch.long)
return {"input_ids": input_ids, "attention_mask": attention_mask}
def print_rank_0(*args, **kwargs) -> None:
if dist.get_rank() == 0:
print(*args, **kwargs)
def print_model_numel(model_dict: dict) -> None:
B = 1024**3
M = 1024**2
K = 1024
outputs = ""
for name, numel in model_dict.items():
outputs += f"{name}: "
if numel >= B:
outputs += f"{numel / B:.2f} B\n"
elif numel >= M:
outputs += f"{numel / M:.2f} M\n"
elif numel >= K:
outputs += f"{numel / K:.2f} K\n"
else:
outputs += f"{numel}\n"
print_rank_0(outputs)
def get_gpt_config(model_name: str) -> OPTConfig:
model_map = {
"125m": OPTConfig.from_pretrained("facebook/opt-125m"),
"350m": OPTConfig(hidden_size=1024, ffn_dim=4096, num_hidden_layers=24, num_attention_heads=16),
"700m": OPTConfig(hidden_size=1280, ffn_dim=5120, num_hidden_layers=36, num_attention_heads=20),
"1.3b": OPTConfig.from_pretrained("facebook/opt-1.3b"),
"2.7b": OPTConfig.from_pretrained("facebook/opt-2.7b"),
"3.5b": OPTConfig(hidden_size=3072, ffn_dim=12288, num_hidden_layers=32, num_attention_heads=32),
"5.5b": OPTConfig(hidden_size=3840, ffn_dim=15360, num_hidden_layers=32, num_attention_heads=32),
"6.7b": OPTConfig.from_pretrained("facebook/opt-6.7b"),
"10b": OPTConfig(hidden_size=5120, ffn_dim=20480, num_hidden_layers=32, num_attention_heads=32),
"13b": OPTConfig.from_pretrained("facebook/opt-13b"),
}
try:
return model_map[model_name]
except KeyError:
raise ValueError(f'Unknown model "{model_name}"')
def main(args):
if args.strategy == "ddp":
strategy = DDPStrategy()
elif args.strategy == "colossalai_gemini":
strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
elif args.strategy == "colossalai_gemini_cpu":
strategy = GeminiStrategy(placement_policy="static", offload_optim_frac=1.0, offload_param_frac=1.0, initial_scale=2**5)
elif args.strategy == "colossalai_zero2":
strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
elif args.strategy == "colossalai_zero2_cpu":
strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
elif args.strategy == "colossalai_zero1":
strategy = LowLevelZeroStrategy(stage=1, placement_policy="cuda")
elif args.strategy == "colossalai_zero1_cpu":
strategy = LowLevelZeroStrategy(stage=1, placement_policy="cpu")
else:
raise ValueError(f'Unsupported strategy "{args.strategy}"')
torch.cuda.set_per_process_memory_fraction(args.cuda_mem_frac)
model_config = get_gpt_config(args.model)
critic_config = get_gpt_config(args.critic_model)
with strategy.model_init_context():
actor = OPTActor(config=model_config, lora_rank=args.lora_rank).cuda()
critic = OPTCritic(config=critic_config, lora_rank=args.lora_rank).cuda()
initial_model = deepcopy(actor).cuda().half()
reward_model = RewardModel(deepcopy(critic.model), deepcopy(critic.value_head)).cuda().half()
if args.use_kernels:
from coati.kernels import convert_to_xformer_model
actor, critic, initial_model, reward_model = map(
convert_to_xformer_model, (actor, critic, initial_model, reward_model)
)
actor_numel = get_model_numel(actor, strategy)
critic_numel = get_model_numel(critic, strategy)
initial_model_numel = get_model_numel(initial_model, strategy)
reward_model_numel = get_model_numel(reward_model, strategy)
print_model_numel(
{
"Actor": actor_numel,
"Critic": critic_numel,
"Initial model": initial_model_numel,
"Reward model": reward_model_numel,
}
)
performance_evaluator = PerformanceEvaluator(
actor_numel,
critic_numel,
initial_model_numel,
reward_model_numel,
enable_grad_checkpoint=False,
ignore_episodes=1,
)
if args.strategy.startswith("colossalai"):
actor_optim = HybridAdam(actor.parameters(), lr=5e-6)
critic_optim = HybridAdam(critic.parameters(), lr=5e-6)
else:
actor_optim = Adam(actor.parameters(), lr=5e-6)
critic_optim = Adam(critic.parameters(), lr=5e-6)
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"
(actor, actor_optim), (critic, critic_optim) = strategy.prepare((actor, actor_optim), (critic, critic_optim))
random_prompts = torch.randint(tokenizer.vocab_size, (1000, 256), device=torch.cuda.current_device())
dataloader = DataLoader(
random_prompts, batch_size=args.experience_batch_size, shuffle=True, collate_fn=preprocess_batch
)
trainer = PPOTrainer(
strategy,
actor,
critic,
reward_model,
initial_model,
actor_optim,
critic_optim,
tokenizer=tokenizer,
ptx_coef=0,
train_batch_size=args.train_batch_size,
offload_inference_models=args.offload_inference_models,
max_length=512,
do_sample=True,
temperature=1.0,
top_k=50,
use_cache=True,
callbacks=[performance_evaluator],
)
trainer.fit(
prompt_dataloader=dataloader,
pretrain_dataloader=None,
num_episodes=args.num_episodes,
num_update_steps=args.num_update_steps,
num_collect_steps=args.num_collect_steps,
)
print_rank_0(f"Peak CUDA mem: {torch.cuda.max_memory_allocated()/1024**3:.2f} GB")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", default="125m")
parser.add_argument("--critic_model", default="125m")
parser.add_argument(
"--strategy",
choices=[
"ddp",
"colossalai_gemini",
"colossalai_gemini_cpu",
"colossalai_zero2",
"colossalai_zero2_cpu",
"colossalai_zero1",
"colossalai_zero1_cpu",
],
default="ddp",
)
parser.add_argument("--num_episodes", type=int, default=3)
parser.add_argument("--num_collect_steps", type=int, default=8)
parser.add_argument("--num_update_steps", type=int, default=1)
parser.add_argument("--train_batch_size", type=int, default=8)
parser.add_argument("--experience_batch_size", type=int, default=8)
parser.add_argument("--lora_rank", type=int, default=0)
parser.add_argument("--cuda_mem_frac", type=float, default=1.0)
parser.add_argument("--offload_inference_models", action="store_true", default=False)
parser.add_argument("--use_kernels", action="store_true", default=False)
args = parser.parse_args()
main(args)

@ -1,13 +0,0 @@
from .prompt_dataset import PromptDataset
from .reward_dataset import HhRlhfDataset, RmStaticDataset
from .sft_dataset import SFTDataset, SupervisedDataset
from .utils import is_rank_0
__all__ = [
"RmStaticDataset",
"HhRlhfDataset",
"SFTDataset",
"SupervisedDataset",
"PromptDataset",
"is_rank_0",
]

@ -1,45 +0,0 @@
from collections import defaultdict
from typing import Dict
import torch
import transformers
from torch.utils.data import Dataset
from colossalai.logging import get_dist_logger
from .utils import jload
class PromptDataset(Dataset):
"""Dataset for supervised fine-tuning."""
def __init__(
self,
data_path: str,
tokenizer: transformers.PreTrainedTokenizer,
max_datasets_size: int = None,
max_length: int = 96,
):
super(PromptDataset, self).__init__()
self.keyed_prompt = defaultdict(list)
self.logger = get_dist_logger()
self.logger.info("Loading data...")
list_data_dict = jload(data_path)
self.logger.info(f"Loaded {len(list_data_dict)} examples.")
if max_datasets_size is not None:
self.logger.info(f"Limiting dataset to {max_datasets_size} examples.")
list_data_dict = list_data_dict[:max_datasets_size]
instructions = [data_dict["instruction"] for data_dict in list_data_dict]
tokens = tokenizer(
instructions, return_tensors="pt", max_length=max_length, padding="max_length", truncation=True
)
for k, tensor in tokens.items():
self.keyed_prompt[k] = tensor.to(torch.cuda.current_device()).unbind()
def __len__(self):
return len(self.keyed_prompt["input_ids"])
def __getitem__(self, i) -> Dict[str, torch.Tensor]:
return {k: v[i] for k, v in self.keyed_prompt.items()}

@ -1,88 +0,0 @@
from typing import Callable
from torch.utils.data import Dataset
from tqdm import tqdm
from .utils import is_rank_0
# Dahoas/rm-static
class RmStaticDataset(Dataset):
"""
Dataset for reward model
Args:
dataset: dataset for reward model
tokenizer: tokenizer for reward model
max_length: max length of input
special_token: special token at the end of sentence
"""
def __init__(self, dataset, tokenizer: Callable, max_length: int, special_token=None) -> None:
super().__init__()
self.end_token = tokenizer.eos_token if special_token is None else special_token
chosen = [data["prompt"] + data["chosen"] + self.end_token for data in tqdm(dataset, disable=not is_rank_0())]
chosen_token = tokenizer(
chosen, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
)
self.chosen = {"input_ids": chosen_token["input_ids"], "attention_mask": chosen_token["attention_mask"]}
reject = [data["prompt"] + data["rejected"] + self.end_token for data in tqdm(dataset, disable=not is_rank_0())]
reject_token = tokenizer(
reject, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
)
self.reject = {"input_ids": reject_token["input_ids"], "attention_mask": reject_token["attention_mask"]}
def __len__(self):
length = self.chosen["input_ids"].shape[0]
return length
def __getitem__(self, idx):
return (
self.chosen["input_ids"][idx],
self.chosen["attention_mask"][idx],
self.reject["input_ids"][idx],
self.reject["attention_mask"][idx],
)
# Anthropic/hh-rlhf
class HhRlhfDataset(Dataset):
"""
Dataset for reward model
Args:
dataset: dataset for reward model
tokenizer: tokenizer for reward model
max_length: max length of input
special_token: special token at the end of sentence
"""
def __init__(self, dataset, tokenizer: Callable, max_length: int, special_token=None) -> None:
super().__init__()
self.end_token = tokenizer.eos_token if special_token is None else special_token
chosen = [data["chosen"] + self.end_token for data in tqdm(dataset, disable=not is_rank_0())]
chosen_token = tokenizer(
chosen, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
)
self.chosen = {"input_ids": chosen_token["input_ids"], "attention_mask": chosen_token["attention_mask"]}
reject = [data["rejected"] + self.end_token for data in tqdm(dataset, disable=not is_rank_0())]
reject_token = tokenizer(
reject, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
)
self.reject = {"input_ids": reject_token["input_ids"], "attention_mask": reject_token["attention_mask"]}
def __len__(self):
length = self.chosen["input_ids"].shape[0]
return length
def __getitem__(self, idx):
return (
self.chosen["input_ids"][idx],
self.chosen["attention_mask"][idx],
self.reject["input_ids"][idx],
self.reject["attention_mask"][idx],
)

@ -1,198 +0,0 @@
# Copyright 2023 Rohan Taori, Ishaan Gulrajani, Tianyi Zhang, Yann Dubois, Xuechen Li
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import copy
from typing import Dict, Optional, Sequence, Tuple
import torch
from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
from torch.utils.data import Dataset
from tqdm import tqdm
from transformers import PreTrainedTokenizer
from colossalai.logging import get_dist_logger
from .utils import is_rank_0, jload
logger = get_dist_logger()
IGNORE_INDEX = -100
PROMPT_DICT = {
"prompt_input": (
"Below is an instruction that describes a task, paired with an input that provides further context. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:"
),
"prompt_no_input": (
"Below is an instruction that describes a task. "
"Write a response that appropriately completes the request.\n\n"
"### Instruction:\n{instruction}\n\n### Response:"
),
}
def _preprocess(
sources: Sequence[str],
targets: Sequence[str],
tokenizer: PreTrainedTokenizer,
max_length: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""Preprocess the data by tokenizing."""
sequences = [s + t for s, t in zip(sources, targets)]
sequences_token = tokenizer(
sequences, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
)
sources_token = tokenizer(
sources, max_length=max_length, padding="max_length", truncation=True, return_tensors="pt"
)
assert sequences_token["attention_mask"].dim() == 2, "seq2seq model should be preprocessed differently"
labels = copy.deepcopy(sequences_token["input_ids"])
for i in range(labels.shape[0]):
source_len = sources_token["attention_mask"][i].sum().item()
pad_len = max_length - sequences_token["attention_mask"][i].sum().item()
if tokenizer.padding_side == "right":
# |prompt|completion|eos|pad|
labels[i][:source_len] = IGNORE_INDEX
labels[i][-pad_len:] = IGNORE_INDEX
elif tokenizer.padding_side == "left":
# |pad|prompt|completion|eos|
labels[i][: pad_len + source_len] = IGNORE_INDEX
else:
raise RuntimeError()
return sequences_token["input_ids"], labels, sequences_token["attention_mask"]
def _preprocess_chatglm(
sources: Sequence[str],
targets: Sequence[str],
tokenizer: PreTrainedTokenizer,
max_length: int,
) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
"""
Preprocess the data by tokenizing.
The attention mask is returned as None; ChatGLM computes the attention mask from the input ids.
"""
labels = []
input_ids = []
for source, target in zip(sources, targets):
source_id = tokenizer.encode(text=source, add_special_tokens=False)
target_id = tokenizer.encode(text=target, add_special_tokens=False)
input_id = tokenizer.build_inputs_with_special_tokens(source_id, target_id)
# truncate
sp_token_list = [tokenizer.gmask_token_id, tokenizer.bos_token_id]
truncate_length = max(0, len(input_id) - max_length)
input_id = input_id[truncate_length:]
if truncate_length == len(source_id) + 1:
input_id = sp_token_list + input_id[1:]
elif truncate_length > len(source_id) + 1:
input_id = sp_token_list + input_id[2:]
context_length = input_id.index(tokenizer.bos_token_id)
mask_position = context_length - 1
label = [IGNORE_INDEX] * context_length + input_id[mask_position + 1 :]
pad_len = max_length - len(input_id)
input_id = input_id + [tokenizer.pad_token_id] * pad_len
input_ids.append(input_id)
labels.append(label + [IGNORE_INDEX] * pad_len)
return torch.tensor(input_ids), torch.tensor(labels), None
class SFTDataset(Dataset):
"""
Dataset for sft model
Args:
dataset: dataset for supervised model
tokenizer: tokenizer for supervised model
max_length: max length of input
"""
def __init__(self, dataset: Dict, tokenizer: PreTrainedTokenizer, max_length: int = 512) -> None:
super().__init__()
self.input_ids = []
sources = [data["prompt"] for data in dataset]
targets = [data["completion"] + tokenizer.eos_token for data in tqdm(dataset, disable=not is_rank_0())]
logger.info("Tokenizing inputs... This may take some time...")
if isinstance(tokenizer, ChatGLMTokenizer):
self.input_ids, self.labels, self.attention_mask = _preprocess_chatglm(
sources, targets, tokenizer, max_length
)
else:
self.input_ids, self.labels, self.attention_mask = _preprocess(sources, targets, tokenizer, max_length)
logger.info("Loaded dataset.")
def __len__(self):
length = self.input_ids.shape[0]
return length
def __getitem__(self, idx):
if self.attention_mask is not None:
return dict(input_ids=self.input_ids[idx], labels=self.labels[idx], attention_mask=self.attention_mask[idx])
else:
return dict(input_ids=self.input_ids[idx], labels=self.labels[idx])
class SupervisedDataset(Dataset):
"""Dataset for supervised fine-tuning."""
def __init__(
self,
data_path: str,
tokenizer: PreTrainedTokenizer,
max_datasets_size: Optional[int] = None,
max_length: int = 512,
):
super().__init__()
logger.info("Loading data...")
list_data_dict = jload(data_path)
logger.info(f"Loaded {len(list_data_dict)} examples.")
if max_datasets_size is not None:
logger.info(f"Limiting dataset to {max_datasets_size} examples.")
list_data_dict = list_data_dict[:max_datasets_size]
logger.info("Formatting inputs...")
prompt_input, prompt_no_input = PROMPT_DICT["prompt_input"], PROMPT_DICT["prompt_no_input"]
sources = [
prompt_input.format_map(example) if "input" in example else prompt_no_input.format_map(example)
for example in list_data_dict
]
targets = [example["output"] + tokenizer.eos_token for example in list_data_dict]
logger.info("Tokenizing inputs... This may take some time...")
if isinstance(tokenizer, ChatGLMTokenizer):
self.input_ids, self.labels, self.attention_mask = _preprocess_chatglm(
sources, targets, tokenizer, max_length
)
else:
self.input_ids, self.labels, self.attention_mask = _preprocess(sources, targets, tokenizer, max_length)
logger.info("Loaded dataset.")
def __len__(self):
length = self.input_ids.shape[0]
return length
def __getitem__(self, idx):
if self.attention_mask is not None:
return dict(input_ids=self.input_ids[idx], labels=self.labels[idx], attention_mask=self.attention_mask[idx])
else:
return dict(input_ids=self.input_ids[idx], labels=self.labels[idx])

@ -1,22 +0,0 @@
import io
import json
import torch.distributed as dist
def is_rank_0() -> bool:
return not dist.is_initialized() or dist.get_rank() == 0
def _make_r_io_base(f, mode: str):
if not isinstance(f, io.IOBase):
f = open(f, mode=mode)
return f
def jload(f, mode="r"):
"""Load a .json file into a dictionary."""
f = _make_r_io_base(f, mode)
jdict = json.load(f)
f.close()
return jdict

@ -1,71 +0,0 @@
import torch
import torch.nn.functional as F
from coati.models.base import Actor, Critic, RewardModel
from coati.models.generation import generate
from coati.models.utils import calc_action_log_probs, compute_reward
from transformers import PreTrainedTokenizer
from .base import Experience, ExperienceMaker
class NaiveExperienceMaker(ExperienceMaker):
"""
Naive experience maker.
"""
def __init__(
self,
actor: Actor,
critic: Critic,
reward_model: RewardModel,
initial_model: Actor,
tokenizer: PreTrainedTokenizer,
kl_coef: float = 0.1,
) -> None:
super().__init__(actor, critic, reward_model, initial_model)
self.tokenizer = tokenizer
self.kl_coef = kl_coef
@torch.no_grad()
def make_experience(self, input_ids: torch.Tensor, **generate_kwargs) -> Experience:
self.actor.eval()
self.critic.eval()
self.initial_model.eval()
self.reward_model.eval()
# generate sequences
sequences = generate(self.actor, input_ids, self.tokenizer, **generate_kwargs)
# calculate auxiliary tensors
attention_mask = None
pad_token_id = self.tokenizer.pad_token_id
if pad_token_id is not None:
attention_mask = sequences.not_equal(pad_token_id).to(dtype=torch.long, device=sequences.device)
input_len = input_ids.size(1)
eos_token_id = self.tokenizer.eos_token_id
if eos_token_id is None:
action_mask = torch.ones_like(sequences, dtype=torch.bool)
else:
# left padding may be applied, only mask action
action_mask = (sequences[:, input_len:] == eos_token_id).cumsum(dim=-1) == 0
action_mask = F.pad(action_mask, (1 + input_len, -1), value=True) # include eos token and input
action_mask[:, :input_len] = False
action_mask = action_mask[:, 1:]
action_mask = action_mask[:, -(sequences.size(1) - input_len) :]
num_actions = action_mask.size(1)
actor_output = self.actor(sequences, attention_mask)["logits"]
action_log_probs = calc_action_log_probs(actor_output, sequences, num_actions)
base_model_output = self.initial_model(sequences, attention_mask)["logits"]
base_action_log_probs = calc_action_log_probs(base_model_output, sequences, num_actions)
value = self.critic(sequences, attention_mask)
r = self.reward_model(sequences, attention_mask)
reward = compute_reward(r, self.kl_coef, action_log_probs, base_action_log_probs, action_mask=action_mask)
advantage = reward - value
# TODO(ver217): maybe normalize adv
if advantage.ndim == 1:
advantage = advantage.unsqueeze(-1)
return Experience(sequences, action_log_probs, value, reward, advantage, attention_mask, action_mask)

@ -1,6 +0,0 @@
from .wrapper import convert_to_xformer_model, recover_from_xformer_model
__all__ = [
"convert_to_xformer_model",
"recover_from_xformer_model",
]

@ -1,90 +0,0 @@
from typing import Optional, Tuple
import torch
import xformers.ops as xops
from torch import Tensor
from transformers.models.opt.modeling_opt import OPTAttention
# This is modified from https://github.com/huggingface/transformers/blob/main/src/transformers/models/opt/modeling_opt.py
class XOPTAttention(OPTAttention):
# def _shape(self, tensor: Tensor, seq_len: int, bsz: int):
# return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).contiguous()
def forward(
self,
hidden_states: Tensor,
key_value_states: Optional[Tensor] = None,
past_key_value: Optional[Tensor] = None,
attention_mask: Optional[Tensor] = None,
layer_head_mask: Optional[Tensor] = None,
output_attentions: bool = False,
) -> Tuple[Tensor, Optional[Tensor], Optional[Tuple[Tensor]]]:
if not self.training:
return super().forward(
hidden_states, key_value_states, past_key_value, attention_mask, layer_head_mask, output_attentions
)
"""Input shape: Batch x Time x Channel"""
assert layer_head_mask is None, "Xformers attention does not support layer_head_mask"
assert not output_attentions, "Xformers attention does not support output_attentions"
# if key_value_states are provided this layer is used as a cross-attention layer
# for the decoder
is_cross_attention = key_value_states is not None
bsz, tgt_len, _ = hidden_states.size()
# get query proj
query_states = self.q_proj(hidden_states)
# get key, value proj
if is_cross_attention and past_key_value is not None:
# reuse k,v, cross_attentions
key_states = past_key_value[0]
value_states = past_key_value[1]
elif is_cross_attention:
# cross_attentions
key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
elif past_key_value is not None:
# reuse k, v, self_attention
key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
key_states = torch.cat([past_key_value[0], key_states], dim=2)
value_states = torch.cat([past_key_value[1], value_states], dim=2)
else:
# self_attention
key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
if self.is_decoder:
# if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
# Further calls to cross_attention layer can then reuse all cross-attention
# key/value_states (first "if" case)
# if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
# all previous decoder key/value_states. Further calls to uni-directional self-attention
# can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
# if encoder bi-directional self-attention `past_key_value` is always `None`
past_key_value = (key_states, value_states)
query_states = self._shape(query_states, tgt_len, bsz).transpose(1, 2)
key_states = key_states.transpose(1, 2)
value_states = value_states.transpose(1, 2)
attn_output = xops.memory_efficient_attention(
query_states,
key_states,
value_states,
attn_bias=xops.LowerTriangularMask(),
p=self.dropout if self.training else 0.0,
scale=self.scaling,
)
# Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
# partitioned across GPUs when using tensor-parallelism.
attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
attn_output = self.out_proj(attn_output)
attn_weights_reshaped = None
return attn_output, attn_weights_reshaped, past_key_value

@ -1,18 +0,0 @@
import torch.nn as nn
from transformers.models.opt.modeling_opt import OPTAttention
from .opt_attn import XOPTAttention
def convert_to_xformer_model(model: nn.Module) -> nn.Module:
for module in model.modules():
if isinstance(module, OPTAttention):
module.__class__ = XOPTAttention
return model
def recover_from_xformer_model(model: nn.Module) -> nn.Module:
for module in model.modules():
if isinstance(module, XOPTAttention):
module.__class__ = OPTAttention
return model

@ -1,15 +0,0 @@
from .base import Actor, Critic, RewardModel
from .lora import LoRAModule, convert_to_lora_module
from .loss import LogExpLoss, LogSigLoss, PolicyLoss, ValueLoss
__all__ = [
"Actor",
"Critic",
"RewardModel",
"PolicyLoss",
"ValueLoss",
"LogSigLoss",
"LogExpLoss",
"LoRAModule",
"convert_to_lora_module",
]

@ -1,27 +0,0 @@
from typing import Union
import torch.nn as nn
from .actor import Actor
from .critic import Critic
from .reward_model import RewardModel
def get_base_model(model: Union[Actor, Critic, RewardModel]) -> nn.Module:
"""Get the base model of our wrapper classes.
For Actor, Critic and RewardModel, return ``model.model``,
which is usually a ``transformers.PreTrainedModel``.
Args:
model (nn.Module): model to get base model from
Returns:
nn.Module: the base model
"""
assert isinstance(
model, (Actor, Critic, RewardModel)
), f"Expect Actor, Critic or RewardModel, got {type(model)}, use unwrap_model first."
return model.model
__all__ = ["Actor", "Critic", "RewardModel", "get_base_model"]

@ -1,33 +0,0 @@
from typing import Optional
import torch
import torch.nn as nn
from ..lora import LoRAModule
class Actor(LoRAModule):
"""
Actor model base class.
Args:
model (nn.Module): Actor Model.
lora_rank (int): LoRA rank.
lora_train_bias (str): LoRA bias training mode.
"""
def __init__(self, model: nn.Module, lora_rank: int = 0, lora_train_bias: str = "none") -> None:
super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias)
self.model = model
self.convert_to_lora()
def forward(
self,
input_ids: torch.LongTensor,
attention_mask: Optional[torch.Tensor] = None,
**model_kwargs,
) -> torch.Tensor:
"""Returns model output."""
output = self.model(input_ids, attention_mask=attention_mask, **model_kwargs)
return output

@ -1,34 +0,0 @@
import torch
import torch.nn as nn
from ..lora import LoRAModule
class Critic(LoRAModule):
"""
Critic model base class.
Args:
model (nn.Module): Critic model.
value_head (nn.Module): Value head to get value.
lora_rank (int): LoRA rank.
lora_train_bias (str): LoRA bias training mode.
"""
def __init__(
self, model: nn.Module, value_head: nn.Module, lora_rank: int = 0, lora_train_bias: str = "none"
) -> None:
super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias)
self.model = model
self.value_head = value_head
self.convert_to_lora()
def forward(self, sequences: torch.LongTensor, attention_mask: torch.Tensor) -> torch.Tensor:
outputs = self.model(sequences, attention_mask=attention_mask)
last_hidden_states = outputs["last_hidden_state"]
sequence_lengths = torch.max(attention_mask * torch.arange(sequences.size(1), device=sequences.device), dim=1)[
0
]
sequence_hidden_states = last_hidden_states[torch.arange(last_hidden_states.size(0)), sequence_lengths]
values = self.value_head(sequence_hidden_states).squeeze(1) # ensure shape is (B, )
return values

@ -1,46 +0,0 @@
from typing import Optional
import torch
import torch.nn as nn
from ..lora import LoRAModule
class RewardModel(LoRAModule):
"""
Reward model base class.
Args:
model (nn.Module): Reward model.
value_head (nn.Module): Value head to get reward score.
lora_rank (int): LoRA rank.
lora_train_bias (str): LoRA bias training mode.
"""
def __init__(
self,
model: nn.Module,
value_head: Optional[nn.Module] = None,
lora_rank: int = 0,
lora_train_bias: str = "none",
) -> None:
super().__init__(lora_rank=lora_rank, lora_train_bias=lora_train_bias)
self.model = model
self.convert_to_lora()
if value_head is not None:
if value_head.out_features != 1:
raise ValueError("The value head of reward model's output dim should be 1!")
self.value_head = value_head
else:
self.value_head = nn.Linear(model.config.n_embd, 1)
def forward(self, sequences: torch.LongTensor, attention_mask: torch.Tensor) -> torch.Tensor:
outputs = self.model(sequences, attention_mask=attention_mask)
last_hidden_states = outputs["last_hidden_state"]
sequence_lengths = torch.max(attention_mask * torch.arange(sequences.size(1), device=sequences.device), dim=1)[
0
]
sequence_hidden_states = last_hidden_states[torch.arange(last_hidden_states.size(0)), sequence_lengths]
values = self.value_head(sequence_hidden_states).squeeze(1) # ensure shape is (B, )
return values

@ -1,5 +0,0 @@
from .bloom_actor import BLOOMActor
from .bloom_critic import BLOOMCritic
from .bloom_rm import BLOOMRM
__all__ = ["BLOOMActor", "BLOOMCritic", "BLOOMRM"]

@ -1,36 +0,0 @@
from typing import Optional
from transformers import BloomConfig, BloomForCausalLM
from ..base import Actor
class BLOOMActor(Actor):
"""
BLOOM Actor model.
Args:
pretrained (str): Pretrained model name or path.
config (BloomConfig): Model config.
checkpoint (bool): Enable gradient checkpointing.
lora_rank (int): LoRA rank.
lora_train_bias (str): LoRA bias training mode.
"""
def __init__(
self,
pretrained: str = None,
config: Optional[BloomConfig] = None,
checkpoint: bool = False,
lora_rank: int = 0,
lora_train_bias: str = "none",
) -> None:
if pretrained is not None:
model = BloomForCausalLM.from_pretrained(pretrained)
elif config is not None:
model = BloomForCausalLM(config)
else:
model = BloomForCausalLM(BloomConfig())
if checkpoint:
model.gradient_checkpointing_enable()
super().__init__(model, lora_rank, lora_train_bias)

@ -1,36 +0,0 @@
from typing import Optional
import torch.nn as nn
from transformers import BloomConfig, BloomModel
from ..base import Critic
class BLOOMCritic(Critic):
"""
BLOOM Critic model.
Args:
pretrained (str): Pretrained model name or path.
config (BloomConfig): Model config.
lora_rank (int): LoRA rank.
lora_train_bias (str): LoRA bias training mode.
"""
def __init__(
self,
pretrained: str = None,
config: Optional[BloomConfig] = None,
lora_rank: int = 0,
lora_train_bias: str = "none",
**kwargs,
) -> None:
if pretrained is not None:
model = BloomModel.from_pretrained(pretrained)
elif config is not None:
model = BloomModel(config)
else:
model = BloomModel(BloomConfig())
value_head = nn.Linear(model.config.hidden_size, 1)
super().__init__(model, value_head, lora_rank, lora_train_bias, **kwargs)

@ -1,36 +0,0 @@
from typing import Optional
import torch.nn as nn
from transformers import BloomConfig, BloomModel
from ..base import RewardModel
class BLOOMRM(RewardModel):
"""
BLOOM Reward model.
Args:
pretrained (str): Pretrained model name or path.
config (BloomConfig): Model config.
lora_rank (int): LoRA rank.
lora_train_bias (str): LoRA bias training mode.
"""
def __init__(
self,
pretrained: str = None,
config: Optional[BloomConfig] = None,
lora_rank: int = 0,
lora_train_bias: str = "none",
) -> None:
if pretrained is not None:
model = BloomModel.from_pretrained(pretrained)
elif config is not None:
model = BloomModel(config)
else:
model = BloomModel(BloomConfig())
value_head = nn.Linear(model.config.hidden_size, 1)
value_head.weight.data.normal_(mean=0.0, std=1 / (model.config.hidden_size + 1))
super().__init__(model, value_head, lora_rank, lora_train_bias)

@ -1,3 +0,0 @@
from .chatglm_actor import ChatGLMActor
__all__ = ["ChatGLMActor"]

@ -1,31 +0,0 @@
from typing import Optional
from ..base import Actor
from .configuration_chatglm import ChatGLMConfig
from .modeling_chatglm import ChatGLMForConditionalGeneration
class ChatGLMActor(Actor):
"""
ChatGLM Actor model.
Args:
pretrained (str): Pretrained model name or path.
config (ChatGLMConfig): Model config.
checkpoint (bool): Enable gradient checkpointing.
LoRA is not supported for now.
"""
def __init__(
self, pretrained: str = None, config: Optional[ChatGLMConfig] = None, checkpoint: bool = False
) -> None:
if pretrained is not None:
model = ChatGLMForConditionalGeneration.from_pretrained(pretrained)
elif config is not None:
model = ChatGLMForConditionalGeneration(config)
else:
model = ChatGLMForConditionalGeneration(ChatGLMConfig())
if checkpoint:
model.gradient_checkpointing_enable()
super().__init__(model, lora_rank=0, lora_train_bias="none")

@ -1,442 +0,0 @@
"""
This code is copied from https://huggingface.co/THUDM/chatglm-6b/blob/main/tokenization_chatglm.py
"""
"""Tokenization classes for ChatGLM."""
import os
from typing import Dict, List, Optional, Union
import numpy as np
import sentencepiece as spm
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.tokenization_utils_base import BatchEncoding, EncodedInput
from transformers.utils import PaddingStrategy, logging
logger = logging.get_logger(__name__)
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
"THUDM/chatglm-6b": 2048,
}
class TextTokenizer:
def __init__(self, model_path):
self.sp = spm.SentencePieceProcessor()
self.sp.Load(model_path)
self.num_tokens = self.sp.vocab_size()
def encode(self, text):
return self.sp.EncodeAsIds(text)
def decode(self, ids: List[int]):
return self.sp.DecodeIds(ids)
def tokenize(self, text):
return self.sp.EncodeAsPieces(text)
def convert_tokens_to_string(self, tokens):
return self.sp.DecodePieces(tokens)
def convert_tokens_to_ids(self, tokens):
return [self.sp.PieceToId(token) for token in tokens]
def convert_token_to_id(self, token):
return self.sp.PieceToId(token)
def convert_id_to_token(self, idx):
return self.sp.IdToPiece(idx)
def __len__(self):
return self.num_tokens
class SPTokenizer:
def __init__(
self,
vocab_file,
num_image_tokens=20000,
max_blank_length=80,
byte_fallback=True,
):
assert vocab_file is not None
self.vocab_file = vocab_file
self.num_image_tokens = num_image_tokens
self.special_tokens = ["[MASK]", "[gMASK]", "[sMASK]", "<unused_0>", "<sop>", "<eop>", "<ENC>", "<dBLOCK>"]
self.max_blank_length = max_blank_length
self.byte_fallback = byte_fallback
self.text_tokenizer = TextTokenizer(vocab_file)
def _get_text_tokenizer(self):
return self.text_tokenizer
@staticmethod
def get_blank_token(length: int):
assert length >= 2
return f"<|blank_{length}|>"
@staticmethod
def get_tab_token():
return "<|tab|>"
@property
def num_text_tokens(self):
return self.text_tokenizer.num_tokens
@property
def num_tokens(self):
return self.num_image_tokens + self.num_text_tokens
@staticmethod
def _encode_whitespaces(text: str, max_len: int = 80):
text = text.replace("\t", SPTokenizer.get_tab_token())
for i in range(max_len, 1, -1):
text = text.replace(" " * i, SPTokenizer.get_blank_token(i))
return text
def _preprocess(self, text: str, linebreak=True, whitespaces=True):
if linebreak:
text = text.replace("\n", "<n>")
if whitespaces:
text = self._encode_whitespaces(text, max_len=self.max_blank_length)
return text
def encode(self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True) -> List[int]:
"""
@param text: Text to encode.
@param linebreak: Whether to encode newline (\n) in text.
@param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
@param special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
@param add_dummy_prefix: Whether to add dummy blank space in the beginning.
"""
text = self._preprocess(text, linebreak, whitespaces)
if not add_dummy_prefix:
text = "<n>" + text
tmp = self._get_text_tokenizer().encode(text)
tokens = [x + self.num_image_tokens for x in tmp]
return tokens if add_dummy_prefix else tokens[2:]
def postprocess(self, text):
text = text.replace("<n>", "\n")
text = text.replace(SPTokenizer.get_tab_token(), "\t")
for i in range(2, self.max_blank_length + 1):
text = text.replace(self.get_blank_token(i), " " * i)
return text
def decode(self, text_ids: List[int]) -> str:
ids = [int(_id) - self.num_image_tokens for _id in text_ids]
ids = [_id for _id in ids if _id >= 0]
text = self._get_text_tokenizer().decode(ids)
text = self.postprocess(text)
return text
def decode_tokens(self, tokens: List[str]) -> str:
text = self._get_text_tokenizer().convert_tokens_to_string(tokens)
text = self.postprocess(text)
return text
def tokenize(self, text: str, linebreak=True, whitespaces=True, add_dummy_prefix=True) -> List[str]:
"""
@param text: Text to encode.
@param linebreak: Whether to encode newline (\n) in text.
@param whitespaces: Whether to encode multiple whitespaces or tab in text, useful for source code encoding.
@param special_tokens: Whether to encode special token ([MASK], [gMASK], etc.) in text.
@param add_dummy_prefix: Whether to add dummy blank space in the beginning.
"""
text = self._preprocess(text, linebreak, whitespaces)
if not add_dummy_prefix:
text = "<n>" + text
tokens = self._get_text_tokenizer().tokenize(text)
return tokens if add_dummy_prefix else tokens[2:]
def __getitem__(self, x: Union[int, str]):
if isinstance(x, int):
if x < self.num_image_tokens:
return "<image_{}>".format(x)
else:
return self.text_tokenizer.convert_id_to_token(x - self.num_image_tokens)
elif isinstance(x, str):
if x.startswith("<image_") and x.endswith(">") and x[7:-1].isdigit():
return int(x[7:-1])
else:
return self.text_tokenizer.convert_token_to_id(x) + self.num_image_tokens
else:
raise ValueError("The key should be str or int.")
class ChatGLMTokenizer(PreTrainedTokenizer):
"""
Construct a ChatGLM tokenizer. Based on byte-level Byte-Pair-Encoding.
Args:
vocab_file (`str`):
Path to the vocabulary file.
"""
vocab_files_names = {"vocab_file": "ice_text.model"}
max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
model_input_names = ["input_ids", "attention_mask", "position_ids"]
def __init__(
self,
vocab_file,
do_lower_case=False,
remove_space=False,
bos_token="<sop>",
eos_token="<eop>",
end_token="</s>",
mask_token="[MASK]",
gmask_token="[gMASK]",
padding_side="left",
pad_token="<pad>",
unk_token="<unk>",
num_image_tokens=20000,
**kwargs,
) -> None:
super().__init__(
do_lower_case=do_lower_case,
remove_space=remove_space,
padding_side=padding_side,
bos_token=bos_token,
eos_token=eos_token,
end_token=end_token,
mask_token=mask_token,
gmask_token=gmask_token,
pad_token=pad_token,
unk_token=unk_token,
num_image_tokens=num_image_tokens,
**kwargs,
)
self.do_lower_case = do_lower_case
self.remove_space = remove_space
self.vocab_file = vocab_file
self.bos_token = bos_token
self.eos_token = eos_token
self.end_token = end_token
self.mask_token = mask_token
self.gmask_token = gmask_token
self.sp_tokenizer = SPTokenizer(vocab_file, num_image_tokens=num_image_tokens)
""" Initialisation """
@property
def gmask_token_id(self) -> Optional[int]:
if self.gmask_token is None:
return None
return self.convert_tokens_to_ids(self.gmask_token)
@property
def end_token_id(self) -> Optional[int]:
"""
`Optional[int]`: Id of the end of context token in the vocabulary. Returns `None` if the token has not been
set.
"""
if self.end_token is None:
return None
return self.convert_tokens_to_ids(self.end_token)
@property
def vocab_size(self):
"""Returns vocab size"""
return self.sp_tokenizer.num_tokens
def get_vocab(self):
"""Returns vocab as a dict"""
vocab = {self._convert_id_to_token(i): i for i in range(self.vocab_size)}
vocab.update(self.added_tokens_encoder)
return vocab
def preprocess_text(self, inputs):
if self.remove_space:
outputs = " ".join(inputs.strip().split())
else:
outputs = inputs
if self.do_lower_case:
outputs = outputs.lower()
return outputs
def _tokenize(self, text, **kwargs):
"""Returns a tokenized string."""
text = self.preprocess_text(text)
seq = self.sp_tokenizer.tokenize(text)
return seq
def convert_tokens_to_string(self, tokens: List[str]) -> str:
return self.sp_tokenizer.decode_tokens(tokens)
def _decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
if isinstance(token_ids, int):
token_ids = [token_ids]
if len(token_ids) == 0:
return ""
if self.pad_token_id in token_ids: # remove pad
token_ids = list(filter((self.pad_token_id).__ne__, token_ids))
return super()._decode(token_ids, **kwargs)
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
return self.sp_tokenizer[token]
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return self.sp_tokenizer[index]
def save_vocabulary(self, save_directory, filename_prefix=None):
"""
Save the vocabulary and special tokens file to a directory.
Args:
save_directory (`str`):
The directory in which to save the vocabulary.
filename_prefix (`str`, *optional*):
An optional prefix to add to the names of the saved files.
Returns:
`Tuple(str)`: Paths to the files saved.
"""
if os.path.isdir(save_directory):
vocab_file = os.path.join(save_directory, self.vocab_files_names["vocab_file"])
else:
vocab_file = save_directory
with open(self.vocab_file, "rb") as fin:
proto_str = fin.read()
with open(vocab_file, "wb") as writer:
writer.write(proto_str)
return (vocab_file,)
def build_inputs_with_special_tokens(
self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
) -> List[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A BERT sequence has the following format:
- single sequence: `[CLS] X [SEP]`
- pair of sequences: `[CLS] A [SEP] B [SEP]`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
gmask_id = self.sp_tokenizer[self.gmask_token]
self.sp_tokenizer[self.eos_token]
token_ids_0 = token_ids_0 + [gmask_id, self.sp_tokenizer[self.bos_token]]
if token_ids_1 is not None:
token_ids_0 = token_ids_0 + token_ids_1
return token_ids_0
def _pad(
self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
Args:
encoded_inputs:
Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
max_length: maximum length of the returned list and optionally padding length (see below).
Will truncate by taking into account the special tokens.
padding_strategy: PaddingStrategy to use for padding.
- PaddingStrategy.LONGEST Pad to the longest sequence in the batch
- PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
- PaddingStrategy.DO_NOT_PAD: Do not pad
The tokenizer padding sides are defined in self.padding_side:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
# Load from model defaults
bos_token_id = self.sp_tokenizer[self.bos_token]
mask_token_id = self.sp_tokenizer[self.mask_token]
gmask_token_id = self.sp_tokenizer[self.gmask_token]
assert self.padding_side == "left"
required_input = encoded_inputs[self.model_input_names[0]]
seq_length = len(required_input)
if padding_strategy == PaddingStrategy.LONGEST:
max_length = len(required_input)
if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
# Initialize attention mask if not present.
if max_length is not None:
if "attention_mask" not in encoded_inputs:
if bos_token_id in required_input:
context_length = required_input.index(bos_token_id)
else:
context_length = seq_length
attention_mask = np.ones((1, seq_length, seq_length))
attention_mask = np.tril(attention_mask)
attention_mask[:, :, :context_length] = 1
attention_mask = np.bool_(attention_mask < 0.5)
encoded_inputs["attention_mask"] = attention_mask
if "position_ids" not in encoded_inputs:
if bos_token_id in required_input:
context_length = required_input.index(bos_token_id)
else:
context_length = seq_length
position_ids = np.arange(seq_length, dtype=np.int64)
mask_token = mask_token_id if mask_token_id in required_input else gmask_token_id
if mask_token in required_input:
mask_position = required_input.index(mask_token)
position_ids[context_length:] = mask_position
block_position_ids = np.concatenate(
[
np.zeros(context_length, dtype=np.int64),
np.arange(1, seq_length - context_length + 1, dtype=np.int64),
]
)
encoded_inputs["position_ids"] = np.stack([position_ids, block_position_ids], axis=0)
if needs_to_be_padded:
difference = max_length - len(required_input)
if "attention_mask" in encoded_inputs:
encoded_inputs["attention_mask"] = np.pad(
encoded_inputs["attention_mask"],
pad_width=[(0, 0), (difference, 0), (difference, 0)],
mode="constant",
constant_values=True,
)
if "token_type_ids" in encoded_inputs:
encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
"token_type_ids"
]
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
if "position_ids" in encoded_inputs:
encoded_inputs["position_ids"] = np.pad(
encoded_inputs["position_ids"], pad_width=[(0, 0), (difference, 0)]
)
encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
return encoded_inputs

@ -1,101 +0,0 @@
"""
This code is copied from https://huggingface.co/THUDM/chatglm-6b/resolve/main/configuration_chatglm.py
"""
""" ChatGLM model configuration """
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
logger = logging.get_logger(__name__)
class ChatGLMConfig(PretrainedConfig):
r"""
This is the configuration class to store the configuration of a [`~ChatGLMModel`].
It is used to instantiate a ChatGLM model according to the specified arguments, defining the model
architecture. Instantiating a configuration with the defaults will yield a similar configuration to that of
the ChatGLM-6B [THUDM/ChatGLM-6B](https://huggingface.co/THUDM/chatglm-6b) architecture.
Configuration objects inherit from [`PretrainedConfig`] and can be used
to control the model outputs. Read the documentation from [`PretrainedConfig`]
for more information.
Args:
vocab_size (`int`, *optional*, defaults to 150528):
Vocabulary size of the ChatGLM-6B model. Defines the number of different tokens that can be represented by the
`inputs_ids` passed when calling [`~ChatGLMModel`] or
[`~TFChatGLMModel`].
hidden_size (`int`, *optional*, defaults to 4096):
Dimension of the encoder layers and the pooler layer.
num_hidden_layers (`int`, *optional*, defaults to 28):
Number of hidden layers in the Transformer encoder.
num_attention_heads (`int`, *optional*, defaults to 32):
Number of attention heads for each attention layer in the Transformer encoder.
inner_hidden_size (`int`, *optional*, defaults to 16384):
Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
max_sequence_length (`int`, *optional*, defaults to 512):
The maximum sequence length that this model might ever be used with.
Typically set this to something large just in case (e.g., 512 or 1024 or 2048).
layernorm_epsilon (`float`, *optional*, defaults to 1e-5):
The epsilon used by the layer normalization layers.
use_cache (`bool`, *optional*, defaults to `True`):
Whether the model should return the last key/values attentions (not used by all models).
Example:
```python
>>> from configuration_chatglm import ChatGLMConfig
>>> from modeling_chatglm import ChatGLMModel
>>> # Initializing a ChatGLM-6B THUDM/ChatGLM-6B style configuration
>>> configuration = ChatGLMConfig()
>>> # Initializing a model from the THUDM/ChatGLM-6B style configuration
>>> model = ChatGLMModel(configuration)
>>> # Accessing the model configuration
>>> configuration = model.config
```"""
model_type = "chatglm"
def __init__(
self,
vocab_size=130528,
hidden_size=4096,
num_layers=28,
num_attention_heads=32,
layernorm_epsilon=1e-5,
use_cache=True,
bos_token_id=130004,
eos_token_id=130005,
mask_token_id=130000,
gmask_token_id=130001,
pad_token_id=3,
max_sequence_length=2048,
inner_hidden_size=16384,
position_encoding_2d=True,
quantization_bit=0,
pre_seq_len=None,
prefix_projection=False,
**kwargs,
):
self.num_layers = num_layers
self.vocab_size = vocab_size
self.hidden_size = hidden_size
self.num_attention_heads = num_attention_heads
self.max_sequence_length = max_sequence_length
self.layernorm_epsilon = layernorm_epsilon
self.inner_hidden_size = inner_hidden_size
self.use_cache = use_cache
self.bos_token_id = bos_token_id
self.eos_token_id = eos_token_id
self.pad_token_id = pad_token_id
self.mask_token_id = mask_token_id
self.gmask_token_id = gmask_token_id
self.position_encoding_2d = position_encoding_2d
self.quantization_bit = quantization_bit
self.pre_seq_len = pre_seq_len
self.prefix_projection = prefix_projection
super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)

File diff suppressed because it is too large

@ -1,152 +0,0 @@
from typing import Any, Callable, Optional
import torch
import torch.distributed as dist
from transformers import PreTrainedTokenizer
from .base import Actor
try:
from transformers.generation_logits_process import (
LogitsProcessorList,
TemperatureLogitsWarper,
TopKLogitsWarper,
TopPLogitsWarper,
)
except ImportError:
from transformers.generation import LogitsProcessorList, TemperatureLogitsWarper, TopKLogitsWarper, TopPLogitsWarper
def _prepare_logits_processor(
top_k: Optional[int] = None, top_p: Optional[float] = None, temperature: Optional[float] = None
) -> LogitsProcessorList:
processor_list = LogitsProcessorList()
if temperature is not None and temperature != 1.0:
processor_list.append(TemperatureLogitsWarper(temperature))
if top_k is not None and top_k != 0:
processor_list.append(TopKLogitsWarper(top_k))
if top_p is not None and top_p < 1.0:
processor_list.append(TopPLogitsWarper(top_p))
return processor_list
def _is_sequence_finished(unfinished_sequences: torch.Tensor) -> bool:
if dist.is_initialized() and dist.get_world_size() > 1:
# consider DP
unfinished_sequences = unfinished_sequences.clone()
dist.all_reduce(unfinished_sequences)
return unfinished_sequences.max() == 0
def _sample(
model: Actor,
input_ids: torch.Tensor,
max_length: int,
early_stopping: bool = False,
eos_token_id: Optional[int] = None,
pad_token_id: Optional[int] = None,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
temperature: Optional[float] = None,
prepare_inputs_fn: Optional[Callable[[torch.Tensor, Any], dict]] = None,
update_model_kwargs_fn: Optional[Callable[[dict, Any], dict]] = None,
**model_kwargs,
) -> torch.Tensor:
if input_ids.size(1) >= max_length:
return input_ids
logits_processor = _prepare_logits_processor(top_k, top_p, temperature)
unfinished_sequences = input_ids.new(input_ids.shape[0]).fill_(1)
for _ in range(input_ids.size(1), max_length):
model_inputs = (
prepare_inputs_fn(input_ids, **model_kwargs) if prepare_inputs_fn is not None else {"input_ids": input_ids}
)
outputs = model(**model_inputs)
# NOTE: this is correct only in left padding mode
next_token_logits = outputs["logits"][:, -1, :]
next_token_logits = logits_processor(input_ids, next_token_logits)
# sample
probs = torch.softmax(next_token_logits, dim=-1, dtype=torch.float)
next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
# finished sentences should have their next token be a padding token
if eos_token_id is not None:
assert pad_token_id is not None, "If `eos_token_id` is defined, make sure that `pad_token_id` is defined."
next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
# update generated ids, model inputs for next step
input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
if update_model_kwargs_fn is not None:
model_kwargs = update_model_kwargs_fn(outputs, model_kwargs)
# if eos_token was found in one sentence, set sentence to finished
if eos_token_id is not None:
unfinished_sequences = unfinished_sequences.mul((next_tokens != eos_token_id).long())
# stop when each sentence is finished if early_stopping=True
if early_stopping and _is_sequence_finished(unfinished_sequences):
break
return input_ids
@torch.no_grad()
def generate(
model: Actor,
input_ids: torch.Tensor,
tokenizer: PreTrainedTokenizer,
max_length: int,
num_beams: int = 1,
do_sample: bool = True,
early_stopping: bool = False,
top_k: Optional[int] = None,
top_p: Optional[float] = None,
temperature: Optional[float] = None,
prepare_inputs_fn: Optional[Callable[[torch.Tensor, Any], dict]] = None,
update_model_kwargs_fn: Optional[Callable[[dict, Any], dict]] = None,
**model_kwargs,
) -> torch.Tensor:
"""Generate token sequence. The returned sequence is input_ids + generated_tokens.
Args:
model (Actor): the actor model used for generation
input_ids (torch.Tensor): input sequence
tokenizer (PreTrainedTokenizer): tokenizer providing `eos_token_id` and `pad_token_id`; must use left padding
max_length (int): max length of the returned sequence
num_beams (int, optional): number of beams. Defaults to 1.
do_sample (bool, optional): whether to do sample. Defaults to True.
early_stopping (bool, optional): if True, the sequence length may be smaller than max_length due to finding eos. Defaults to False.
top_k (Optional[int], optional): the number of highest probability vocabulary tokens to keep for top-k-filtering. Defaults to None.
top_p (Optional[float], optional): If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation. Defaults to None.
temperature (Optional[float], optional): The value used to module the next token probabilities. Defaults to None.
prepare_inputs_fn (Optional[Callable[[torch.Tensor, Any], dict]], optional): Function to preprocess model inputs. Arguments of this function should be input_ids and model_kwargs. Defaults to None.
update_model_kwargs_fn (Optional[Callable[[dict, Any], dict]], optional): Function to update model_kwargs based on outputs. Arguments of this function should be outputs and model_kwargs. Defaults to None.
"""
assert tokenizer.padding_side == "left", "Current generation only supports left padding."
is_greedy_gen_mode = (num_beams == 1) and do_sample is False
is_sample_gen_mode = (num_beams == 1) and do_sample is True
is_beam_gen_mode = (num_beams > 1) and do_sample is False
if is_greedy_gen_mode:
# run greedy search
raise NotImplementedError
elif is_sample_gen_mode:
# run sample
return _sample(
model,
input_ids,
max_length,
early_stopping=early_stopping,
eos_token_id=tokenizer.eos_token_id,
pad_token_id=tokenizer.pad_token_id,
top_k=top_k,
top_p=top_p,
temperature=temperature,
prepare_inputs_fn=prepare_inputs_fn,
update_model_kwargs_fn=update_model_kwargs_fn,
**model_kwargs,
)
elif is_beam_gen_mode:
raise NotImplementedError
else:
raise ValueError("Unsupported generation mode")


@ -1,5 +0,0 @@
from .gpt_actor import GPTActor
from .gpt_critic import GPTCritic
from .gpt_rm import GPTRM
__all__ = ["GPTActor", "GPTCritic", "GPTRM"]


@ -1,38 +0,0 @@
from typing import Optional
from transformers.models.gpt2.configuration_gpt2 import GPT2Config
from transformers.models.gpt2.modeling_gpt2 import GPT2LMHeadModel
from ..base import Actor
class GPTActor(Actor):
"""
GPT Actor model.
Args:
pretrained (str): Pretrained model name or path.
config (GPT2Config): Model config.
checkpoint (bool): Enable gradient checkpointing.
lora_rank (int): Rank of the LoRA layer.
lora_train_bias (str): Bias training strategy for the LoRA layer.
"""
def __init__(
self,
pretrained: Optional[str] = None,
config: Optional[GPT2Config] = None,
checkpoint: bool = False,
lora_rank: int = 0,
lora_train_bias: str = "none",
**kwargs,
) -> None:
if pretrained is not None:
model = GPT2LMHeadModel.from_pretrained(pretrained)
elif config is not None:
model = GPT2LMHeadModel(config)
else:
model = GPT2LMHeadModel(GPT2Config())
if checkpoint:
model.gradient_checkpointing_enable()
super().__init__(model, lora_rank, lora_train_bias, **kwargs)


@ -1,37 +0,0 @@
from typing import Optional
import torch.nn as nn
from transformers.models.gpt2.configuration_gpt2 import GPT2Config
from transformers.models.gpt2.modeling_gpt2 import GPT2Model
from ..base import Critic
class GPTCritic(Critic):
"""
GPT Critic model.
Args:
pretrained (str): Pretrained model name or path.
config (GPT2Config): Model config.
lora_rank (int): Rank of the LoRA decomposition.
lora_train_bias (str): LoRA bias training mode.
"""
def __init__(
self,
pretrained: Optional[str] = None,
config: Optional[GPT2Config] = None,
lora_rank: int = 0,
lora_train_bias: str = "none",
**kwargs,
) -> None:
if pretrained is not None:
model = GPT2Model.from_pretrained(pretrained)
elif config is not None:
model = GPT2Model(config)
else:
model = GPT2Model(GPT2Config())
value_head = nn.Linear(model.config.n_embd, 1)
super().__init__(model, value_head, lora_rank, lora_train_bias, **kwargs)


@ -1,37 +0,0 @@
from typing import Optional
import torch.nn as nn
from transformers.models.gpt2.configuration_gpt2 import GPT2Config
from transformers.models.gpt2.modeling_gpt2 import GPT2Model
from ..base import RewardModel
class GPTRM(RewardModel):
"""
GPT Reward model.
Args:
pretrained (str): Pretrained model name or path.
config (GPT2Config): Model config.
lora_rank (int): Rank of the low-rank approximation.
lora_train_bias (str): LoRA bias training mode.
"""
def __init__(
self,
pretrained: Optional[str] = None,
config: Optional[GPT2Config] = None,
lora_rank: int = 0,
lora_train_bias: str = "none",
) -> None:
if pretrained is not None:
model = GPT2Model.from_pretrained(pretrained)
elif config is not None:
model = GPT2Model(config)
else:
model = GPT2Model(GPT2Config())
value_head = nn.Linear(model.config.n_embd, 1)
value_head.weight.data.normal_(mean=0.0, std=1 / (model.config.n_embd + 1))
super().__init__(model, value_head, lora_rank, lora_train_bias)


@ -1,5 +0,0 @@
from .llama_actor import LlamaActor
from .llama_critic import LlamaCritic
from .llama_rm import LlamaRM
__all__ = ["LlamaActor", "LlamaCritic", "LlamaRM"]


@ -1,38 +0,0 @@
from typing import Optional
from transformers import LlamaConfig, LlamaForCausalLM
from ..base import Actor
class LlamaActor(Actor):
"""
Llama Actor model.
Args:
pretrained (str): Pretrained model name or path.
config (LlamaConfig): Model config.
checkpoint (bool): Enable gradient checkpointing.
lora_rank (int): LoRA rank.
lora_train_bias (str): LoRA bias training mode.
"""
def __init__(
self,
pretrained: Optional[str] = None,
config: Optional[LlamaConfig] = None,
checkpoint: bool = False,
lora_rank: int = 0,
lora_train_bias: str = "none",
) -> None:
if pretrained is not None:
model = LlamaForCausalLM.from_pretrained(pretrained)
elif config is not None:
model = LlamaForCausalLM(config)
else:
model = LlamaForCausalLM(LlamaConfig())
if checkpoint:
model.gradient_checkpointing_enable()
super().__init__(model, lora_rank, lora_train_bias)


@ -1,36 +0,0 @@
from typing import Optional
import torch.nn as nn
from transformers import LlamaConfig, LlamaModel
from ..base import Critic
class LlamaCritic(Critic):
"""
Llama Critic model.
Args:
pretrained (str): Pretrained model name or path.
config (LlamaConfig): Model config.
lora_rank (int): LoRA rank.
lora_train_bias (str): LoRA bias training mode.
"""
def __init__(
self,
pretrained: Optional[str] = None,
config: Optional[LlamaConfig] = None,
lora_rank: int = 0,
lora_train_bias: str = "none",
**kwargs,
) -> None:
if pretrained is not None:
model = LlamaModel.from_pretrained(pretrained)
elif config is not None:
model = LlamaModel(config)
else:
model = LlamaModel(LlamaConfig())
value_head = nn.Linear(model.config.hidden_size, 1)
super().__init__(model, value_head, lora_rank, lora_train_bias, **kwargs)


@ -1,37 +0,0 @@
from typing import Optional
import torch.nn as nn
from transformers import LlamaConfig, LlamaModel
from ..base import RewardModel
class LlamaRM(RewardModel):
"""
Llama Reward model.
Args:
pretrained (str): Pretrained model name or path.
config (LlamaConfig): Model config.
lora_rank (int): LoRA rank.
lora_train_bias (str): LoRA bias training mode.
"""
def __init__(
self,
pretrained: Optional[str] = None,
config: Optional[LlamaConfig] = None,
lora_rank: int = 0,
lora_train_bias: str = "none",
) -> None:
if pretrained is not None:
model = LlamaModel.from_pretrained(pretrained)
elif config is not None:
model = LlamaModel(config)
else:
model = LlamaModel(LlamaConfig())
value_head = nn.Linear(model.config.hidden_size, 1)
value_head.weight.data.normal_(mean=0.0, std=1 / (model.config.hidden_size + 1))
super().__init__(model, value_head, lora_rank, lora_train_bias)


@ -1,153 +0,0 @@
import dataclasses
import math
import warnings
from typing import Optional
import loralib as lora
import torch
import torch.nn as nn
import torch.nn.functional as F
@dataclasses.dataclass
class LoRAManager:
merge_weights: bool = False
LORA_MANAGER = LoRAManager()
class LoraLinear(lora.LoRALayer, nn.Module):
"""Replace in-place ops to out-of-place ops to fit gemini. Convert a torch.nn.Linear to LoraLinear."""
def __init__(
self,
weight: nn.Parameter,
bias: Optional[nn.Parameter],
r: int = 0,
lora_alpha: int = 1,
lora_dropout: float = 0.0,
# Set this to True if the layer to replace stores weight like (fan_in, fan_out)
fan_in_fan_out: bool = False,
):
nn.Module.__init__(self)
lora.LoRALayer.__init__(self, r=r, lora_alpha=lora_alpha, lora_dropout=lora_dropout, merge_weights=False)
self.weight = weight
self.bias = bias
out_features, in_features = weight.shape
self.in_features = in_features
self.out_features = out_features
self.fan_in_fan_out = fan_in_fan_out
# Actual trainable parameters
if r > 0:
self.lora_A = nn.Parameter(self.weight.new_zeros((r, in_features)))
self.lora_B = nn.Parameter(self.weight.new_zeros((out_features, r)))
self.scaling = self.lora_alpha / self.r
# Freezing the pre-trained weight matrix
self.weight.requires_grad = False
self.reset_parameters()
if fan_in_fan_out:
self.weight.data = self.weight.data.T
def reset_parameters(self):
if hasattr(self, "lora_A"):
# Initialize A with the default values for nn.Linear and set B to zero.
nn.init.kaiming_uniform_(self.lora_A, a=math.sqrt(5))
nn.init.zeros_(self.lora_B)
def train(self, mode: bool = True):
def T(w):
return w.T if self.fan_in_fan_out else w
self.training = mode
if LORA_MANAGER.merge_weights:
if mode and self.merged:
warnings.warn("Invoke module.train() would unmerge LoRA weights.")
raise NotImplementedError("LoRA unmerge is not tested.")
# Make sure that the weights are not merged
if self.r > 0:
if not hasattr(self, "lora_A") or not hasattr(self, "lora_B"):
# FIXME(csric): temporary fix
self.lora_A = nn.Parameter(self.weight.new_empty((self.r, self.in_features)))
self.lora_B = nn.Parameter(self.weight.new_empty((self.out_features, self.r)))
self.reset_parameters()
else:
self.weight.data -= T(self.lora_B @ self.lora_A) * self.scaling
self.merged = False
elif not mode and not self.merged:
warnings.warn("Invoke module.eval() would merge LoRA weights.")
# Merge the weights and mark it
if self.r > 0:
self.weight.data += T(self.lora_B @ self.lora_A) * self.scaling
delattr(self, "lora_A")
delattr(self, "lora_B")
self.merged = True
return self
def forward(self, x: torch.Tensor):
def T(w):
return w.T if self.fan_in_fan_out else w
if self.r > 0 and not self.merged:
result = F.linear(x, T(self.weight), bias=self.bias)
if self.r > 0:
result = result + (self.lora_dropout(x) @ self.lora_A.t() @ self.lora_B.t()) * self.scaling
return result
else:
return F.linear(x, T(self.weight), bias=self.bias)
def _lora_linear_wrapper(linear: nn.Linear, lora_rank: int) -> LoraLinear:
assert (
lora_rank <= linear.in_features
), f"LoRA rank ({lora_rank}) must be less than or equal to in features ({linear.in_features})"
lora_linear = LoraLinear(linear.weight, linear.bias, r=lora_rank)
return lora_linear
def _convert_to_lora_recursively(module: nn.Module, lora_rank: int) -> None:
for name, child in module.named_children():
if isinstance(child, nn.Linear):
setattr(module, name, _lora_linear_wrapper(child, lora_rank))
else:
_convert_to_lora_recursively(child, lora_rank)
def convert_to_lora_module(module: nn.Module, lora_rank: int, lora_train_bias: str = "none") -> nn.Module:
"""Convert a torch.nn.Module to a LoRA module.
Args:
module (nn.Module): The module to convert.
lora_rank (int): LoRA rank.
lora_train_bias (str): Which biases to keep trainable: 'none', 'all' or 'lora_only'. Defaults to 'none'.
Returns:
nn.Module: The converted module.
"""
if lora_rank <= 0:
return module
_convert_to_lora_recursively(module, lora_rank)
lora.mark_only_lora_as_trainable(module, lora_train_bias)
return module
class LoRAModule(nn.Module):
"""A LoRA module base class. All derived classes should call `convert_to_lora()` at the bottom of `__init__()`.
This class will convert all torch.nn.Linear layer to LoraLinear layer.
Args:
lora_rank (int, optional): LoRA rank. 0 means LoRA is not applied. Defaults to 0.
lora_train_bias (str, optional): Which biases to train alongside the LoRA layers.
'none' means it doesn't train biases. 'all' means it trains all biases. 'lora_only' means it only trains biases of LoRA layers.
Defaults to 'none'.
"""
def __init__(self, lora_rank: int = 0, lora_train_bias: str = "none") -> None:
super().__init__()
self.lora_rank = lora_rank
self.lora_train_bias = lora_train_bias
def convert_to_lora(self) -> None:
convert_to_lora_module(self, self.lora_rank, self.lora_train_bias)
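
A quick, hedged illustration of how `convert_to_lora_module` above behaves; the layer sizes and rank are arbitrary, the import path is assumed from this diff, and `loralib` must be installed.
```python
import torch.nn as nn

from coati.models.lora import convert_to_lora_module  # assumed module path

mlp = nn.Sequential(nn.Linear(64, 128), nn.ReLU(), nn.Linear(128, 64))
mlp = convert_to_lora_module(mlp, lora_rank=8, lora_train_bias="none")

# Each nn.Linear has been swapped for a LoraLinear whose base weight is frozen;
# only the rank-8 lora_A / lora_B factors still require gradients.
print([name for name, p in mlp.named_parameters() if p.requires_grad])
# -> ['0.lora_A', '0.lora_B', '2.lora_A', '2.lora_B']
```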


@ -1,97 +0,0 @@
from typing import Optional
import torch
import torch.nn as nn
from .utils import masked_mean
class GPTLMLoss(nn.Module):
"""
GPT Language Model Loss
"""
def __init__(self):
super().__init__()
# NOTE: default ignore_index is -100, which is equal to IGNORE_INDEX in sft_dataset.py
self.loss = nn.CrossEntropyLoss()
def forward(self, logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
shift_logits = logits[..., :-1, :].contiguous()
shift_labels = labels[..., 1:].contiguous()
# Flatten the tokens
return self.loss(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
class PolicyLoss(nn.Module):
"""
Policy Loss for PPO
"""
def __init__(self, clip_eps: float = 0.2) -> None:
super().__init__()
self.clip_eps = clip_eps
def forward(
self,
log_probs: torch.Tensor,
old_log_probs: torch.Tensor,
advantages: torch.Tensor,
action_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
ratio = (log_probs - old_log_probs).exp()
surr1 = ratio * advantages
surr2 = ratio.clamp(1 - self.clip_eps, 1 + self.clip_eps) * advantages
loss = -torch.min(surr1, surr2)
if action_mask is not None:
loss = masked_mean(loss, action_mask)
loss = loss.mean()
return loss
class ValueLoss(nn.Module):
"""
Value Loss for PPO
"""
def __init__(self, clip_eps: float = 0.4) -> None:
super().__init__()
self.clip_eps = clip_eps
def forward(
self,
values: torch.Tensor,
old_values: torch.Tensor,
reward: torch.Tensor,
action_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
values_clipped = old_values + (values - old_values).clamp(-self.clip_eps, self.clip_eps)
surr1 = (values_clipped - reward) ** 2
surr2 = (values - reward) ** 2
loss = torch.max(surr1, surr2)
loss = loss.mean()
return 0.5 * loss
class LogSigLoss(nn.Module):
"""
Pairwise Loss for Reward Model
Details: https://arxiv.org/abs/2203.02155
"""
def forward(self, chosen_reward: torch.Tensor, reject_reward: torch.Tensor) -> torch.Tensor:
probs = torch.sigmoid(chosen_reward - reject_reward)
log_probs = torch.log(probs)
loss = -log_probs.mean()
return loss
class LogExpLoss(nn.Module):
"""
Pairwise Loss for Reward Model
Details: https://arxiv.org/abs/2204.05862
"""
def forward(self, chosen_reward: torch.Tensor, reject_reward: torch.Tensor) -> torch.Tensor:
loss = torch.log(1 + torch.exp(reject_reward - chosen_reward)).mean()
return loss
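
A toy numerical check of the clipped PPO policy loss and the pairwise reward-model loss defined above; every tensor value is made up and the import path is assumed from this diff.
```python
import torch

from coati.models.loss import LogSigLoss, PolicyLoss  # assumed module path

policy_loss = PolicyLoss(clip_eps=0.2)
log_probs = torch.tensor([[-1.0, -2.0]])      # new policy log-probs
old_log_probs = torch.tensor([[-1.5, -1.5]])  # behaviour policy log-probs
advantages = torch.tensor([[1.0, -1.0]])

# ratio = exp(new - old) = [1.649, 0.607]; it is clamped to [0.8, 1.2]
# before the pessimistic min over the two surrogates is taken.
print(policy_loss(log_probs, old_log_probs, advantages))  # tensor(-0.2000)

# Pairwise ranking loss: -log sigmoid(chosen_reward - reject_reward)
print(LogSigLoss()(torch.tensor([1.5]), torch.tensor([0.3])))  # ~ tensor(0.2633)
```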


@ -1,5 +0,0 @@
from .opt_actor import OPTActor
from .opt_critic import OPTCritic
from .opt_rm import OPTRM
__all__ = ["OPTActor", "OPTCritic", "OPTRM"]


@ -1,37 +0,0 @@
from typing import Optional
from transformers.models.opt.configuration_opt import OPTConfig
from transformers.models.opt.modeling_opt import OPTForCausalLM
from ..base import Actor
class OPTActor(Actor):
"""
OPT Actor model.
Args:
pretrained (str): Pretrained model name or path.
config (OPTConfig): Model config.
checkpoint (bool): Enable gradient checkpointing.
lora_rank (int): Rank of the low-rank approximation.
lora_train_bias (str): LoRA bias training mode.
"""
def __init__(
self,
pretrained: Optional[str] = None,
config: Optional[OPTConfig] = None,
checkpoint: bool = False,
lora_rank: int = 0,
lora_train_bias: str = "none",
) -> None:
if pretrained is not None:
model = OPTForCausalLM.from_pretrained(pretrained)
elif config is not None:
model = OPTForCausalLM(config)
else:
model = OPTForCausalLM(OPTConfig())
if checkpoint:
model.gradient_checkpointing_enable()
super().__init__(model, lora_rank, lora_train_bias)


@ -1,37 +0,0 @@
from typing import Optional
import torch.nn as nn
from transformers.models.opt.configuration_opt import OPTConfig
from transformers.models.opt.modeling_opt import OPTModel
from ..base import Critic
class OPTCritic(Critic):
"""
OPT Critic model.
Args:
pretrained (str): Pretrained model name or path.
config (OPTConfig): Model config.
lora_rank (int): Rank of the low-rank approximation.
lora_train_bias (str): LoRA bias training mode.
"""
def __init__(
self,
pretrained: Optional[str] = None,
config: Optional[OPTConfig] = None,
lora_rank: int = 0,
lora_train_bias: str = "none",
**kwargs,
) -> None:
if pretrained is not None:
model = OPTModel.from_pretrained(pretrained)
elif config is not None:
model = OPTModel(config)
else:
model = OPTModel(OPTConfig())
value_head = nn.Linear(model.config.word_embed_proj_dim, 1)
super().__init__(model, value_head, lora_rank, lora_train_bias, **kwargs)


@ -1,36 +0,0 @@
from typing import Optional
import torch.nn as nn
from transformers import OPTConfig, OPTModel
from ..base import RewardModel
class OPTRM(RewardModel):
"""
OPT Reward model.
Args:
pretrained (str): Pretrained model name or path.
config (OPTConfig): Model config.
lora_rank (int): Rank of the low-rank approximation.
lora_train_bias (str): LoRA bias training mode.
"""
def __init__(
self,
pretrained: Optional[str] = None,
config: Optional[OPTConfig] = None,
lora_rank: int = 0,
lora_train_bias: str = "none",
) -> None:
if pretrained is not None:
model = OPTModel.from_pretrained(pretrained)
elif config is not None:
model = OPTModel(config)
else:
model = OPTModel(OPTConfig())
value_head = nn.Linear(model.config.word_embed_proj_dim, 1)
value_head.weight.data.normal_(mean=0.0, std=1 / (model.config.word_embed_proj_dim + 1))
super().__init__(model, value_head, lora_rank, lora_train_bias)


@ -1,69 +0,0 @@
from typing import Optional, Union
import torch
import torch.nn.functional as F
def _compute_approx_kl(
log_probs: torch.Tensor, log_probs_base: torch.Tensor, action_mask: Optional[torch.Tensor] = None
) -> torch.Tensor:
"""
Compute the approximate KL divergence between two distributions.
Schulman blog: http://joschu.net/blog/kl-approx.html
Args:
log_probs: Log probabilities of the new distribution.
log_probs_base: Log probabilities of the base distribution.
action_mask: Mask for actions.
"""
log_ratio = log_probs_base - log_probs
approx_kl = (log_ratio.exp() - 1) - log_ratio
if action_mask is not None:
approx_kl = masked_mean(approx_kl, action_mask, dim=1)
return approx_kl
approx_kl = approx_kl.mean(dim=1)
return approx_kl
def compute_reward(
r: Union[torch.Tensor, float],
kl_coef: float,
log_probs: torch.Tensor,
log_probs_base: torch.Tensor,
action_mask: Optional[torch.Tensor] = None,
) -> torch.Tensor:
if kl_coef <= 0.0:
return r
kl = _compute_approx_kl(log_probs, log_probs_base, action_mask=action_mask)
reward = r - kl_coef * kl
return reward
def _log_probs_from_logits(logits: torch.Tensor, labels: torch.Tensor) -> torch.Tensor:
log_probs = F.log_softmax(logits, dim=-1)
log_probs_labels = log_probs.gather(dim=-1, index=labels.unsqueeze(-1))
return log_probs_labels.squeeze(-1)
def calc_action_log_probs(logits: torch.Tensor, sequences: torch.LongTensor, num_actions: int) -> torch.Tensor:
"""Calculate action log probs.
Args:
logits (torch.Tensor): Output logits of Actor.forward.
sequences (torch.LongTensor): Input sequences.
num_actions (int): Number of actions.
Returns:
torch.Tensor: Action log probs.
"""
log_probs = _log_probs_from_logits(logits[:, :-1, :], sequences[:, 1:])
return log_probs[:, -num_actions:]
def masked_mean(tensor: torch.Tensor, mask: torch.Tensor, dim: int = 1) -> torch.Tensor:
tensor = tensor * mask
tensor = tensor.sum(dim=dim)
mask_sum = mask.sum(dim=dim)
mean = tensor / (mask_sum + 1e-8)
return mean
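
A small, illustrative run of the KL-shaped reward above; every number is invented and the import path is assumed from this diff.
```python
import torch

from coati.models.utils import compute_reward  # assumed module path

log_probs = torch.tensor([[-1.0, -2.0, -1.5]])       # actor log-probs per action
log_probs_base = torch.tensor([[-1.1, -1.8, -1.5]])  # reference-model log-probs
action_mask = torch.ones_like(log_probs)

r = torch.tensor([0.7])  # scalar score from the reward model
reward = compute_reward(r, kl_coef=0.1,
                        log_probs=log_probs,
                        log_probs_base=log_probs_base,
                        action_mask=action_mask)
print(reward)  # slightly below 0.7: kl_coef * approximate KL is subtracted per sequence
```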


@ -1,6 +0,0 @@
from .base import OnPolicyTrainer, SLTrainer
from .ppo import PPOTrainer
from .rm import RewardModelTrainer
from .sft import SFTTrainer
__all__ = ["SLTrainer", "OnPolicyTrainer", "RewardModelTrainer", "SFTTrainer", "PPOTrainer"]


@ -1,5 +0,0 @@
from .base import Callback
from .performance_evaluator import PerformanceEvaluator
from .save_checkpoint import SaveCheckpoint
__all__ = ["Callback", "PerformanceEvaluator", "SaveCheckpoint"]


@ -1,76 +0,0 @@
import os
import torch.distributed as dist
from coati.trainer.strategies import GeminiStrategy, LowLevelZeroStrategy, Strategy
from coati.trainer.utils import is_rank_0
from torch import nn
from torch.optim import Optimizer
from .base import Callback
class SaveCheckpoint(Callback):
"""
The callback for saving checkpoint for coati.
Only supports saving the actor and critic models.
A typical architecture of the saved checkpoint would be:
- checkpoint
- episode_x
- actor.pt
- actor-optim-rank-0.pt
- actor-optim-rank-1.pt
- critic.pt
- critic-optim-rank-0.pt
- critic-optim-rank-1.pt
- ...
Args:
path(str): the base path to save checkpoints to; checkpoints are written under `path/checkpoint`
interval(int): save a checkpoint every `interval` episodes
strategy(Strategy): the strategy used to train
actor(nn.Module): the actor model
critic(nn.Module): the critic model
actor_optim(Optimizer): the optimizer of actor
critic_optim(Optimizer): the optimizer of critic
"""
def __init__(
self,
path: str,
interval: int,
strategy: Strategy,
actor: nn.Module = None,
critic: nn.Module = None,
actor_optim: Optimizer = None,
critic_optim: Optimizer = None,
) -> None:
super().__init__()
self.path = os.path.join(path, "checkpoint")
self.interval = interval
self.strategy = strategy
self.model_dict = {"actor": [actor, actor_optim], "critic": [critic, critic_optim]}
def on_episode_end(self, episode: int) -> None:
if (episode + 1) % self.interval != 0:
return
base_path = os.path.join(self.path, f"episode_{episode}")
if not os.path.exists(base_path):
os.makedirs(base_path)
for model in self.model_dict.keys():
# save model
if self.model_dict[model][0] is None:
# saving only optimizer states is meaningless, so it would be skipped
continue
model_path = os.path.join(base_path, f"{model}.pt")
self.strategy.save_model(model=self.model_dict[model][0], path=model_path, only_rank0=True)
# save optimizer
if self.model_dict[model][1] is None:
continue
only_rank0 = not isinstance(self.strategy, (LowLevelZeroStrategy, GeminiStrategy))
rank = 0 if is_rank_0() else dist.get_rank()
optim_path = os.path.join(base_path, f"{model}-optim-rank-{rank}.pt")
self.strategy.save_optimizer(optimizer=self.model_dict[model][1], path=optim_path, only_rank0=only_rank0)


@ -1,202 +0,0 @@
from typing import Dict, List, Optional
from coati.experience_buffer import NaiveExperienceBuffer
from coati.experience_maker import Experience, NaiveExperienceMaker
from coati.models.base import Actor, Critic, RewardModel, get_base_model
from coati.models.loss import GPTLMLoss, PolicyLoss, ValueLoss
from coati.models.utils import calc_action_log_probs
from torch.optim import Optimizer
from torch.utils.data import DataLoader, DistributedSampler
from tqdm import tqdm
from transformers import PreTrainedTokenizerBase
from colossalai.utils import get_current_device
from .base import OnPolicyTrainer
from .callbacks import Callback
from .strategies import GeminiStrategy, Strategy
from .utils import CycledDataLoader, is_rank_0, to_device
def _set_default_generate_kwargs(strategy: Strategy, generate_kwargs: dict, actor: Actor) -> Dict:
unwrapped_model = strategy.unwrap_model(actor)
hf_model = get_base_model(unwrapped_model)
new_kwargs = {**generate_kwargs}
# use huggingface models method directly
if "prepare_inputs_fn" not in generate_kwargs and hasattr(hf_model, "prepare_inputs_for_generation"):
new_kwargs["prepare_inputs_fn"] = hf_model.prepare_inputs_for_generation
if "update_model_kwargs_fn" not in generate_kwargs and hasattr(hf_model, "_update_model_kwargs_for_generation"):
new_kwargs["update_model_kwargs_fn"] = hf_model._update_model_kwargs_for_generation
return new_kwargs
class PPOTrainer(OnPolicyTrainer):
"""
Trainer for PPO algorithm.
Args:
strategy (Strategy): the strategy to use for training
actor (Actor): the actor model in ppo algorithm
critic (Critic): the critic model in ppo algorithm
reward_model (RewardModel): the reward model in the rlhf algorithm that scores generated sentences
initial_model (Actor): the initial model in the rlhf algorithm that produces reference logits to limit the actor's updates
actor_optim (Optimizer): the optimizer to use for actor model
critic_optim (Optimizer): the optimizer to use for critic model
kl_coef (float, defaults to 0.1): the coefficient of kl divergence loss
train_batch_size (int, defaults to 8): the batch size to use for training
buffer_limit (int, defaults to 0): the max_size limitation of buffer
buffer_cpu_offload (bool, defaults to True): whether to offload buffer to cpu
eps_clip (float, defaults to 0.2): the clip coefficient of policy loss
vf_coef (float, defaults to 1.0): the coefficient of value loss
ptx_coef (float, defaults to 0.9): the coefficient of ptx loss
value_clip (float, defaults to 0.4): the clip coefficient of value loss
sample_buffer (bool, defaults to False): whether to sample from buffer
dataloader_pin_memory (bool, defaults to True): whether to pin memory for data loader
offload_inference_models (bool, defaults to True): whether to offload inference models to cpu during training process
callbacks (List[Callback], defaults to []): the callbacks to call during training process
generate_kwargs (dict, optional): the kwargs to use while model generating
"""
def __init__(
self,
strategy: Strategy,
actor: Actor,
critic: Critic,
reward_model: RewardModel,
initial_model: Actor,
actor_optim: Optimizer,
critic_optim: Optimizer,
tokenizer: PreTrainedTokenizerBase,
kl_coef: float = 0.1,
ptx_coef: float = 0.9,
train_batch_size: int = 8,
buffer_limit: int = 0,
buffer_cpu_offload: bool = True,
eps_clip: float = 0.2,
vf_coef: float = 1.0,
value_clip: float = 0.4,
sample_buffer: bool = False,
dataloader_pin_memory: bool = True,
offload_inference_models: bool = True,
callbacks: List[Callback] = [],
**generate_kwargs,
) -> None:
if isinstance(strategy, GeminiStrategy):
assert not offload_inference_models, "GeminiPlugin is not compatible with manual model.to('cpu')"
data_buffer = NaiveExperienceBuffer(train_batch_size, buffer_limit, buffer_cpu_offload)
super().__init__(strategy, data_buffer, sample_buffer, dataloader_pin_memory, callbacks)
self.generate_kwargs = _set_default_generate_kwargs(strategy, generate_kwargs, actor)
self.experience_maker = NaiveExperienceMaker(actor, critic, reward_model, initial_model, tokenizer, kl_coef)
self.actor = actor
self.critic = critic
self.tokenizer = tokenizer
self.actor_loss_fn = PolicyLoss(eps_clip)
self.critic_loss_fn = ValueLoss(value_clip)
self.vf_coef = vf_coef
self.ptx_loss_fn = GPTLMLoss()
self.ptx_coef = ptx_coef
self.actor_optim = actor_optim
self.critic_optim = critic_optim
self.offload_inference_models = offload_inference_models
self.device = get_current_device()
def _before_fit(
self,
prompt_dataloader: DataLoader,
pretrain_dataloader: DataLoader,
log_dir: Optional[str] = None,
use_wandb: bool = False,
):
"""
Args:
prompt_dataloader (DataLoader): the dataloader to use for prompt data
pretrain_dataloader (DataLoader): the dataloader to use for pretrain data
"""
self.prompt_dataloader = CycledDataLoader(prompt_dataloader)
self.pretrain_dataloader = CycledDataLoader(pretrain_dataloader)
self.writer = None
if use_wandb and is_rank_0():
assert log_dir is not None, "log_dir must be provided when use_wandb is True"
import wandb
wandb.init(project="Coati-ppo", sync_tensorboard=True)
if log_dir is not None and is_rank_0():
import os
import time
from torch.utils.tensorboard import SummaryWriter
log_dir = os.path.join(log_dir, "ppo")
log_dir = os.path.join(log_dir, time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime()))
self.writer = SummaryWriter(log_dir=log_dir)
def _make_experience(self, collect_step: int) -> Experience:
prompts = self.prompt_dataloader.next()
if self.offload_inference_models:
# TODO(ver217): this may be controlled by strategy if they are prepared by strategy
self.experience_maker.initial_model.to(self.device)
self.experience_maker.reward_model.to(self.device)
assert isinstance(prompts, dict), f'Unsupported input type "{type(prompts)}"'
return self.experience_maker.make_experience(**prompts, **self.generate_kwargs)
def _training_step(self, experience: Experience):
self.actor.train()
self.critic.train()
# policy loss
num_actions = experience.action_log_probs.size(1)
actor_logits = self.actor(experience.sequences, experience.attention_mask)["logits"]
action_log_probs = calc_action_log_probs(actor_logits, experience.sequences, num_actions)
actor_loss = self.actor_loss_fn(
action_log_probs, experience.action_log_probs, experience.advantages, action_mask=experience.action_mask
)
actor_loss = (1 - self.ptx_coef) * actor_loss
self.strategy.backward(actor_loss, self.actor, self.actor_optim)
# ptx loss
if self.ptx_coef != 0:
batch = self.pretrain_dataloader.next()
batch = to_device(batch, self.device)
ptx_log_probs = self.actor(batch["input_ids"], batch["attention_mask"])["logits"]
ptx_loss = self.ptx_coef * self.ptx_loss_fn(ptx_log_probs, batch["labels"])
self.strategy.backward(ptx_loss, self.actor, self.actor_optim)
self.strategy.optimizer_step(self.actor_optim)
self.actor_optim.zero_grad()
# value loss
values = self.critic(experience.sequences, attention_mask=experience.attention_mask)
critic_loss = self.critic_loss_fn(values, experience.values, experience.reward)
critic_loss = critic_loss * self.vf_coef
self.strategy.backward(critic_loss, self.critic, self.critic_optim)
self.strategy.optimizer_step(self.critic_optim)
self.critic_optim.zero_grad()
def _learn(self, update_step: int):
if self.offload_inference_models:
self.experience_maker.initial_model.to("cpu")
self.experience_maker.reward_model.to("cpu")
# buffer may be empty at first, we should rebuild at each training
if self.sample_buffer:
experience = self.data_buffer.sample()
self._on_learn_batch_start()
experience.to_device(self.device)
self._training_step(experience)
self._on_learn_batch_end(experience)
else:
if isinstance(self.dataloader.sampler, DistributedSampler):
self.dataloader.sampler.set_epoch(update_step)
pbar = tqdm(self.dataloader, desc=f"Train epoch [{update_step + 1}]", disable=not is_rank_0())
for experience in pbar:
self._on_learn_batch_start()
experience.to_device(self.device)
self._training_step(experience)
self._on_learn_batch_end(experience)
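
To make the moving parts above concrete, here is a hedged wiring sketch for the removed trainer. It follows the `strategy.prepare` pattern documented in the Strategy base class further down; the GPT-2 backbones, optimizer settings, and generation kwargs are illustrative assumptions, import paths are assumed from this diff, and the script is meant to be launched with a distributed launcher such as `torchrun` or `colossalai run`.
```python
from colossalai.nn.optimizer import HybridAdam
from transformers import AutoTokenizer

from coati.models.gpt import GPTActor, GPTCritic, GPTRM  # assumed module paths
from coati.trainer import PPOTrainer
from coati.trainer.strategies import GeminiStrategy

strategy = GeminiStrategy(placement_policy="auto")  # also sets up the distributed env

with strategy.model_init_context():
    actor = GPTActor(pretrained="gpt2")
    critic = GPTCritic(pretrained="gpt2")
    reward_model = GPTRM(pretrained="gpt2")
    initial_model = GPTActor(pretrained="gpt2")

actor_optim = HybridAdam(actor.parameters(), lr=1e-7)
critic_optim = HybridAdam(critic.parameters(), lr=1e-7)

(actor, actor_optim), (critic, critic_optim), reward_model, initial_model = strategy.prepare(
    (actor, actor_optim), (critic, critic_optim), reward_model, initial_model
)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "left"  # required by the generation helper above

trainer = PPOTrainer(
    strategy, actor, critic, reward_model, initial_model,
    actor_optim, critic_optim, tokenizer,
    kl_coef=0.1, ptx_coef=0.9, train_batch_size=8,
    offload_inference_models=False,  # GeminiStrategy forbids manual .to("cpu")
    # remaining kwargs are forwarded to generation during experience collection
    max_length=128, do_sample=True, temperature=1.0, top_k=50,
)
# trainer.fit(...) then alternates experience collection (_make_experience) and
# PPO updates (_learn); the fit loop lives in the OnPolicyTrainer base class.
```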


@ -1,123 +0,0 @@
from typing import Callable, Optional
import torch
import tqdm
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import DataLoader
from .base import SLTrainer
from .strategies import Strategy
from .utils import is_rank_0
class RewardModelTrainer(SLTrainer):
"""
Trainer to use while training reward model.
Args:
model (torch.nn.Module): the model to train
strategy (Strategy): the strategy to use for training
optim (Optimizer): the optimizer to use for training
lr_scheduler (_LRScheduler): the lr scheduler to use for training
loss_fn (callable): the loss function to use for training
max_epochs (int, defaults to 1): the number of epochs to train
"""
def __init__(
self,
model,
strategy: Strategy,
optim: Optimizer,
lr_scheduler: _LRScheduler,
loss_fn: Callable,
max_epochs: int = 1,
) -> None:
super().__init__(strategy, max_epochs, model, optim)
self.loss_fn = loss_fn
self.scheduler = lr_scheduler
self.num_train_step = 0
def _eval(self, epoch):
if self.eval_dataloader is not None:
self.model.eval()
dist, num_correct, num_samples = 0, 0, 0
with torch.no_grad():
for chosen_ids, c_mask, reject_ids, r_mask in self.eval_dataloader:
chosen_ids = chosen_ids.squeeze(1).to(torch.cuda.current_device())
c_mask = c_mask.squeeze(1).to(torch.cuda.current_device())
reject_ids = reject_ids.squeeze(1).to(torch.cuda.current_device())
r_mask = r_mask.squeeze(1).to(torch.cuda.current_device())
chosen_reward = self.model(chosen_ids, attention_mask=c_mask)
reject_reward = self.model(reject_ids, attention_mask=r_mask)
num_samples += chosen_ids.size(0)
num_correct += (chosen_reward > reject_reward).sum().item()
dist += (chosen_reward - reject_reward).mean().item()
self.dist = dist / len(self.eval_dataloader)
self.acc = num_correct / num_samples
if self.writer:
self.writer.add_scalar("eval/dist", self.dist, epoch)
self.writer.add_scalar("eval/acc", self.acc, epoch)
def _train(self, epoch):
self.model.train()
step_bar = tqdm.trange(
len(self.train_dataloader), desc=f"Epoch {epoch + 1}/{self.max_epochs}", disable=not is_rank_0()
)
for chosen_ids, c_mask, reject_ids, r_mask in self.train_dataloader:
chosen_ids = chosen_ids.squeeze(1).to(torch.cuda.current_device())
c_mask = c_mask.squeeze(1).to(torch.cuda.current_device())
reject_ids = reject_ids.squeeze(1).to(torch.cuda.current_device())
r_mask = r_mask.squeeze(1).to(torch.cuda.current_device())
chosen_reward = self.model(chosen_ids, attention_mask=c_mask)
reject_reward = self.model(reject_ids, attention_mask=r_mask)
loss = self.loss_fn(chosen_reward, reject_reward)
self.strategy.backward(loss, self.model, self.optimizer)
self.strategy.optimizer_step(self.optimizer)
self.optimizer.zero_grad()
if self.writer:
self.writer.add_scalar("train/loss", loss.item(), self.num_train_step)
self.writer.add_scalar("train/lr", self.optimizer.param_groups[0]["lr"], self.num_train_step)
self.writer.add_scalar("train/dist", (chosen_reward - reject_reward).mean().item(), self.num_train_step)
self.writer.add_scalar(
"train/acc", (chosen_reward > reject_reward).float().mean().item(), self.num_train_step
)
self.num_train_step += 1
if self.num_train_step % 100 == 0:
self.scheduler.step()
step_bar.update()
step_bar.close()
def _before_fit(
self,
train_dataloader: DataLoader,
eval_dataloader: DataLoader,
log_dir: Optional[str] = None,
use_wandb: bool = False,
):
"""
Args:
train_dataloader (DataLoader): the dataloader to use for training
eval_dataloader (DataLoader): the dataloader to use for evaluation
"""
self.train_dataloader = train_dataloader
self.eval_dataloader = eval_dataloader
self.writer = None
if use_wandb and is_rank_0():
assert log_dir is not None, "log_dir must be provided when use_wandb is True"
import wandb
wandb.init(project="Coati-rm", sync_tensorboard=True)
if log_dir is not None and is_rank_0():
import os
import time
from torch.utils.tensorboard import SummaryWriter
log_dir = os.path.join(log_dir, "rm")
log_dir = os.path.join(log_dir, time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime()))
self.writer = SummaryWriter(log_dir=log_dir)
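
A hedged construction sketch for the reward-model trainer above. The GPT-2 backbone, optimizer settings, and cosine schedule are illustrative choices, import paths are assumed from this diff, and the script is expected to run under a distributed launcher.
```python
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR

from coati.models.gpt import GPTRM                # assumed module path
from coati.models.loss import LogSigLoss
from coati.trainer import RewardModelTrainer
from coati.trainer.strategies import DDPStrategy

strategy = DDPStrategy(seed=42)
with strategy.model_init_context():
    model = GPTRM(pretrained="gpt2")

optim = Adam(model.parameters(), lr=5e-6)
lr_scheduler = CosineAnnealingLR(optim, T_max=1_000)
(model, optim) = strategy.prepare((model, optim))

trainer = RewardModelTrainer(model, strategy, optim, lr_scheduler,
                             loss_fn=LogSigLoss(), max_epochs=1)
# trainer.fit(...) then consumes dataloaders yielding
# (chosen_ids, c_mask, reject_ids, r_mask) batches, as _train/_eval above expect.
```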


@ -1,130 +0,0 @@
from typing import Optional
import torch
import torch.distributed as dist
import tqdm
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import DataLoader
from colossalai.logging import DistributedLogger
from .base import SLTrainer
from .strategies import GeminiStrategy, Strategy
from .utils import is_rank_0, to_device
class SFTTrainer(SLTrainer):
"""
Trainer to use for supervised fine-tuning (SFT).
Args:
model (torch.nn.Module): the model to train
strategy (Strategy): the strategy to use for training
optim(Optimizer): the optimizer to use for training
lr_scheduler(_LRScheduler): the lr scheduler to use for training
max_epochs (int, defaults to 2): the number of epochs to train
accumulation_steps (int, defaults to 8): the number of steps to accumulate gradients
"""
def __init__(
self,
model,
strategy: Strategy,
optim: Optimizer,
lr_scheduler: _LRScheduler,
max_epochs: int = 2,
accumulation_steps: int = 8,
) -> None:
if accumulation_steps > 1:
assert not isinstance(
strategy, GeminiStrategy
), "Accumulation steps are not supported in stage 3 of ColossalAI"
super().__init__(strategy, max_epochs, model, optim)
self.accumulation_steps = accumulation_steps
self.scheduler = lr_scheduler
self.num_train_step = 0
self.num_eval_step = 0
def _train(self, epoch: int):
self.model.train()
step_bar = tqdm.trange(
len(self.train_dataloader) // self.accumulation_steps,
desc=f"Epoch {epoch + 1}/{self.max_epochs}",
disable=not is_rank_0(),
)
for i, batch in enumerate(self.train_dataloader):
batch = to_device(batch, torch.cuda.current_device())
outputs = self.model(batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"])
loss = outputs.loss / self.accumulation_steps
self.total_loss += loss.item()
self.strategy.backward(loss, self.model, self.optimizer)
# gradient accumulation
if (i + 1) % self.accumulation_steps == 0:
self.strategy.optimizer_step(self.optimizer)
self.optimizer.zero_grad()
self.scheduler.step()
if self.writer:
self.writer.add_scalar("train/loss", self.total_loss, self.num_train_step)
self.writer.add_scalar("train/lr", self.scheduler.get_last_lr()[0], self.num_train_step)
self.num_train_step += 1
self.total_loss = 0
step_bar.update()
step_bar.close()
def _eval(self, epoch: int):
if self.eval_dataloader is not None:
self.model.eval()
with torch.no_grad():
loss_sum, num_seen = 0, 0
for batch in self.eval_dataloader:
batch = to_device(batch, torch.cuda.current_device())
outputs = self.model(
batch["input_ids"], attention_mask=batch["attention_mask"], labels=batch["labels"]
)
loss_sum += outputs.loss.item()
num_seen += batch["input_ids"].size(0)
loss_mean = loss_sum / num_seen
if dist.get_rank() == 0:
self.logger.info(f"Eval Epoch {epoch}/{self.max_epochs} loss {loss_mean}")
if self.writer:
self.writer.add_scalar("eval/loss", loss_mean, self.num_eval_step)
self.num_eval_step += 1
def _before_fit(
self,
train_dataloader: DataLoader,
eval_dataloader: Optional[DataLoader] = None,
logger: Optional[DistributedLogger] = None,
log_dir: Optional[str] = None,
use_wandb: bool = False,
):
"""
Args:
train_dataloader: the dataloader to use for training
eval_dataloader: the dataloader to use for evaluation
"""
self.train_dataloader = train_dataloader
self.eval_dataloader = eval_dataloader
self.logger = logger
self.writer = None
if use_wandb and is_rank_0():
assert log_dir is not None, "log_dir must be provided when use_wandb is True"
import wandb
wandb.init(project="Coati-sft", sync_tensorboard=True)
if log_dir is not None and is_rank_0():
import os
import time
from torch.utils.tensorboard import SummaryWriter
log_dir = os.path.join(log_dir, "sft")
log_dir = os.path.join(log_dir, time.strftime("%Y-%m-%d_%H:%M:%S", time.localtime()))
self.writer = SummaryWriter(log_dir=log_dir)
self.total_loss = 0


@ -1,5 +0,0 @@
from .base import Strategy
from .colossalai import GeminiStrategy, LowLevelZeroStrategy
from .ddp import DDPStrategy
__all__ = ["Strategy", "DDPStrategy", "LowLevelZeroStrategy", "GeminiStrategy"]


@ -1,137 +0,0 @@
from abc import ABC, abstractmethod
from contextlib import nullcontext
from typing import Callable, Dict, List, Optional, Tuple, Union
import torch
import torch.nn as nn
from coati.experience_buffer import ExperienceBuffer
from torch.optim import Optimizer
from torch.utils.data import DataLoader
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from colossalai.booster import Booster
from colossalai.booster.plugin import Plugin
from .sampler import DistributedSampler
_BoostArgSpec = Union[nn.Module, Tuple[nn.Module, Optimizer], Dict]
class Strategy(ABC):
"""
Base class for training strategies.
"""
def __init__(self, plugin_initializer: Callable[..., Optional[Plugin]] = lambda: None) -> None:
super().__init__()
# NOTE: dist must be initialized before Booster
self.setup_distributed()
self.plugin = plugin_initializer()
self.booster = Booster(plugin=self.plugin)
self._post_init()
@abstractmethod
def _post_init(self) -> None:
pass
def backward(self, loss: torch.Tensor, model: nn.Module, optimizer: Optimizer, **kwargs) -> None:
self.booster.backward(loss, optimizer)
def optimizer_step(self, optimizer: Optimizer, **kwargs) -> None:
optimizer.step()
@abstractmethod
def setup_distributed(self) -> None:
pass
@abstractmethod
def setup_dataloader(self, data_buffer: ExperienceBuffer, pin_memory: bool = False) -> DataLoader:
pass
def model_init_context(self):
return nullcontext()
def prepare(self, *boost_args: _BoostArgSpec) -> Union[List[_BoostArgSpec], _BoostArgSpec]:
"""Prepare [model | (model, optimizer) | Dict] based on each strategy.
NOTE: the keys of Dict must be a subset of `self.booster.boost`'s arguments.
Example::
>>> # e.g., include lr_scheduler
>>> result_dict = strategy.prepare(dict(model=model, lr_scheduler=lr_scheduler))
>>> # when fine-tuning actor and critic
>>> (actor, actor_optim), (critic, critic_optim), reward_model, initial_model = strategy.prepare((actor, actor_optim), (critic, critic_optim), reward_model, initial_model)
>>> # or when training reward model
>>> (reward_model, reward_model_optim) = strategy.prepare((reward_model, reward_model_optim))
>>> # or just inference
>>> actor, critic = strategy.prepare(actor, critic)
Returns:
Union[List[_BoostArgSpec], _BoostArgSpec]: [model | (model, optimizer) | Dict] in the original order.
"""
rets = []
for arg in boost_args:
if isinstance(arg, nn.Module):
model, *_ = self.booster.boost(arg)
rets.append(model)
elif isinstance(arg, tuple):
try:
model, optimizer = arg
except ValueError:
raise RuntimeError(f'Expect (model, optimizer) pair, got a tuple with size "{len(arg)}"')
model, optimizer, *_ = self.booster.boost(model=model, optimizer=optimizer)
rets.append((model, optimizer))
elif isinstance(arg, Dict):
model, optimizer, criterion, dataloader, lr_scheduler = self.booster.boost(**arg)
boost_result = dict(
model=model,
optimizer=optimizer,
criterion=criterion,
dataloader=dataloader,
lr_scheduler=lr_scheduler,
)
# remove None values
boost_result = {key: value for key, value in boost_result.items() if value is not None}
rets.append(boost_result)
else:
raise RuntimeError(f"Type {type(arg)} is not supported")
return rets[0] if len(rets) == 1 else rets
@staticmethod
def unwrap_model(model: nn.Module) -> nn.Module:
"""Get the unwrapped model from a wrapped model made by Strategy.prepare.
Args:
model (nn.Module): the model to unwrap
Returns:
nn.Module: the original model
"""
return model
def save_model(self, model: nn.Module, path: str, shard: bool = False, **kwargs) -> None:
self.booster.save_model(model, path, shard=shard, **kwargs)
def load_model(self, model: nn.Module, path: str, strict: bool = True) -> None:
self.booster.load_model(model, path, strict)
def save_optimizer(self, optimizer: Optimizer, path: str, only_rank0: bool = False, **kwargs) -> None:
self.booster.save_optimizer(optimizer, path, shard=not only_rank0, **kwargs)
def load_optimizer(self, optimizer: Optimizer, path: str) -> None:
self.booster.load_optimizer(optimizer, path)
def setup_sampler(self, dataset) -> DistributedSampler:
# FIXME(cwher): this is only invoked in train_on_ray, not tested after adapt Boost API.
return DistributedSampler(dataset, 1, 0)
@abstractmethod
def save_pretrained(
self, model: nn.Module, path: str, only_rank0: bool = True, tokenizer: Optional[PreTrainedTokenizerBase] = None
) -> None:
pass
@abstractmethod
def get_model_state_dict_shard(self, model: nn.Module, **config):
pass


@ -1,200 +0,0 @@
import warnings
from typing import Optional
import torch.nn as nn
import colossalai
from colossalai.booster.plugin import GeminiPlugin, LowLevelZeroPlugin
from colossalai.booster.plugin.low_level_zero_plugin import LowLevelZeroModel
from colossalai.utils import get_current_device
from colossalai.zero.gemini.gemini_ddp import GeminiDDP
from .ddp import DDPStrategy
class LowLevelZeroStrategy(DDPStrategy):
"""
The strategy for training with ColossalAI's low-level ZeRO plugin (ZeRO stage 1/2).
Args:
stage(int): The stage to use in ZeRO. Choose in (1, 2)
precision(str): The precision to use. Choose in ('fp32', 'fp16').
seed(int): The seed for the random number generator.
placement_policy(str): The placement policy. Choose in ('cpu', 'cuda')
If it is cpu, parameters, gradients and optimizer states will be offloaded to CPU,
If it is cuda, they will not be offloaded, which means max CUDA memory will be used. It is the fastest.
reduce_bucket_size(int): The reduce bucket size in bytes. Only for ZeRO-1 and ZeRO-2.
overlap_communication(bool): Whether to overlap communication and computation. Only for ZeRO-1 and ZeRO-2.
initial_scale(float): The initial scale for the optimizer.
growth_factor(float): The growth factor for the optimizer.
backoff_factor(float): The backoff factor for the optimizer.
growth_interval(int): The growth interval for the optimizer.
hysteresis(int): The hysteresis for the optimizer.
min_scale(float): The minimum scale for the optimizer.
max_scale(float): The maximum scale for the optimizer.
max_norm(float): The maximum norm for the optimizer.
norm_type(float): The norm type for the optimizer.
"""
def __init__(
self,
stage: int = 2,
precision: str = "fp16",
seed: int = 42,
placement_policy: str = "cuda",
reduce_bucket_size: int = 12 * 1024**2, # only for stage 1&2
overlap_communication: bool = True, # only for stage 1&2
initial_scale: float = 2**16,
growth_factor: float = 2,
backoff_factor: float = 0.5,
growth_interval: int = 1000,
hysteresis: int = 2,
min_scale: float = 1,
max_scale: float = 2**32,
max_norm: float = 0.0,
norm_type: float = 2.0,
) -> None:
assert stage in (1, 2), f'Unsupported stage "{stage}"'
assert placement_policy in ("cpu", "cuda"), f'Unsupported placement policy "{placement_policy}"'
assert precision in ("fp32", "fp16"), f'Unsupported precision "{precision}"'
plugin_initializer = lambda: LowLevelZeroPlugin(
stage=stage,
precision=precision,
reduce_bucket_size_in_m=reduce_bucket_size,
overlap_communication=overlap_communication,
cpu_offload=(placement_policy == "cpu"),
initial_scale=initial_scale,
growth_factor=growth_factor,
backoff_factor=backoff_factor,
growth_interval=growth_interval,
hysteresis=hysteresis,
min_scale=min_scale,
max_scale=max_scale,
max_norm=max_norm,
norm_type=norm_type,
)
super().__init__(seed, plugin_initializer)
def _post_init(self) -> None:
assert isinstance(
self.plugin, LowLevelZeroPlugin
), f"{type(self).__name__}'s plugin is not initialized properly."
def setup_distributed(self) -> None:
colossalai.launch_from_torch({}, seed=self.seed)
def unwrap_model(self, model: nn.Module) -> nn.Module:
assert isinstance(model, LowLevelZeroModel)
return model.module
def get_model_state_dict_shard(self, model: nn.Module, **config):
assert isinstance(model, LowLevelZeroModel)
yield from model.state_dict_shard(max_shard_size=1024, only_rank_0=False)
class GeminiStrategy(DDPStrategy):
"""
The strategy for training with ColossalAI's Gemini plugin (ZeRO-3).
Args:
seed(int): The seed for the random number generator.
shard_init(bool): Whether to shard the model parameters during initialization. Only for ZeRO-3.
This is not compatible with `from_pretrained()`. We temporarily disable this and will support it in the future.
placement_policy(str): The placement policy for gemini. Choose in ('cpu', 'cuda')
If it is cpu, parameters, gradients and optimizer states will be offloaded to CPU,
If it is cuda, they will not be offloaded, which means max CUDA memory will be used. It is the fastest.
pin_memory(bool): Whether to pin the memory for the data loader. Only for ZeRO-3.
force_outputs_fp32(bool): Whether to force the outputs to be fp32. Only for ZeRO-3.
search_range_m(int): The number of search range for the chunk size, divided by 2^20. Only for ZeRO-3.
hidden_dim(optional, int): The hidden dimension for the gemini. Only for ZeRO-3.
min_chunk_size_m(float): The minimum chunk size divided by 2^20. Only for ZeRO-3.
gpu_margin_mem_ratio(float): The margin memory ratio for the GPU. Only for ZeRO-3.
initial_scale(float): The initial scale for the optimizer.
growth_factor(float): The growth factor for the optimizer.
backoff_factor(float): The backoff factor for the optimizer.
growth_interval(int): The growth interval for the optimizer.
hysteresis(int): The hysteresis for the optimizer.
min_scale(float): The minimum scale for the optimizer.
max_scale(float): The maximum scale for the optimizer.
max_norm(float): The maximum norm for the optimizer.
norm_type(float): The norm type for the optimizer.
"""
def __init__(
self,
seed: int = 42,
shard_init: bool = False, # only for stage 3
placement_policy: str = "auto",
shard_param_frac: float = 1.0, # only for static placement
offload_optim_frac: float = 0.0, # only for static placement
offload_param_frac: float = 0.0, # only for static placement
pin_memory: bool = True, # only for stage 3
force_outputs_fp32: bool = False, # only for stage 3
search_range_m: int = 32, # only for stage 3
hidden_dim: Optional[int] = None, # only for stage 3
min_chunk_size_m: float = 32, # only for stage 3
gpu_margin_mem_ratio: float = 0.0, # only for stage 3
initial_scale: float = 2**16,
growth_factor: float = 2,
backoff_factor: float = 0.5,
growth_interval: int = 1000,
hysteresis: int = 2,
min_scale: float = 1,
max_scale: float = 2**32,
max_norm: float = 0.0,
norm_type: float = 2.0,
) -> None:
# TODO(ver217): support shard_init when using from_pretrained()
if shard_init:
warnings.warn(
f"Shard init is not supported model.from_pretrained() yet. "
"Please load weights after strategy.prepare()"
)
self.shard_init = shard_init
warnings.warn(f"Stage 3 only supports fp16. Precision is set to fp16.")
# NOTE: dist should be initialized before calling get_current_device()
plugin_initializer = lambda: GeminiPlugin(
chunk_init_device=get_current_device(),
placement_policy=placement_policy,
shard_param_frac=shard_param_frac,
offload_optim_frac=offload_optim_frac,
offload_param_frac=offload_param_frac,
precision="fp16",
pin_memory=pin_memory,
force_outputs_fp32=force_outputs_fp32,
strict_ddp_mode=shard_init,
search_range_m=search_range_m,
hidden_dim=hidden_dim,
min_chunk_size_m=min_chunk_size_m,
gpu_margin_mem_ratio=gpu_margin_mem_ratio,
initial_scale=initial_scale,
growth_factor=growth_factor,
backoff_factor=backoff_factor,
growth_interval=growth_interval,
hysteresis=hysteresis,
min_scale=min_scale,
max_scale=max_scale,
max_norm=max_norm,
norm_type=norm_type,
)
super().__init__(seed, plugin_initializer)
def _post_init(self) -> None:
assert isinstance(self.plugin, GeminiPlugin), f"{type(self).__name__}'s plugin is not initialized properly."
def setup_distributed(self) -> None:
colossalai.launch_from_torch({}, seed=self.seed)
def model_init_context(self):
return super().model_init_context()
def unwrap_model(self, model: nn.Module) -> nn.Module:
assert isinstance(model, GeminiDDP)
return model.module


@ -1,136 +0,0 @@
import os
import random
from collections import OrderedDict
from typing import Callable, Optional
import numpy as np
import torch
import torch.distributed as dist
import torch.nn as nn
from coati.experience_buffer import ExperienceBuffer
from coati.models import Actor, Critic, RewardModel
from torch.utils.data import DataLoader
from transformers.modeling_utils import PreTrainedModel
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from colossalai.booster.plugin import TorchDDPPlugin
from colossalai.booster.plugin.torch_ddp_plugin import TorchDDPModel
from .base import Strategy
from .sampler import DistributedSampler
# TODO Move this to a util.py (Moving to ray.util introduces ringed import)
def get_grad_required_state_dict(model: nn.Module):
state_dict = OrderedDict()
for name, parameter in model.named_parameters():
if parameter.requires_grad:
state_dict[name] = parameter.detach()
return state_dict
class DDPStrategy(Strategy):
"""
Strategy for distributed training using torch.distributed.
"""
def __init__(self, seed: int = 42, plugin_initializer: Callable = TorchDDPPlugin) -> None:
self.seed = seed
super().__init__(plugin_initializer)
def _try_init_dist(self, force: bool = False) -> None:
try:
rank = int(os.environ["RANK"])
local_rank = int(os.environ["LOCAL_RANK"])
world_size = int(os.environ["WORLD_SIZE"])
host = os.environ["MASTER_ADDR"]
port = int(os.environ["MASTER_PORT"])
dist.init_process_group("nccl", init_method=f"tcp://[{host}]:{port}", world_size=world_size, rank=rank)
torch.cuda.set_device(local_rank)
except KeyError as e:
if force:
raise RuntimeError(
f"Could not find {e} in the torch environment, visit https://www.colossalai.org/ for more information on launching with torch"
)
except Exception as e:
if force:
raise e
def _post_init(self) -> None:
assert isinstance(self.plugin, TorchDDPPlugin), f"{type(self).__name__}'s plugin is not initialized properly."
def setup_distributed(self) -> None:
self._try_init_dist(force=True)
self.set_seed(self.seed)
def set_seed(self, seed: int) -> None:
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
def setup_dataloader(self, data_buffer: ExperienceBuffer, pin_memory: bool = False) -> DataLoader:
return self.plugin.prepare_dataloader(
data_buffer,
batch_size=data_buffer.sample_batch_size,
shuffle=True,
drop_last=True,
pin_memory=pin_memory,
collate_fn=data_buffer.collate_fn,
)
def setup_sampler(self, dataset) -> DistributedSampler:
# FIXME(cwher): this is only invoked in train_on_ray, not tested after adapt Boost API.
return DistributedSampler(dataset, dist.get_world_size(), dist.get_rank())
def unwrap_model(self, model: nn.Module) -> nn.Module:
assert isinstance(model, TorchDDPModel), "model is not wrapped by TorchDDPModel."
return model.unwrap()
def save_pretrained(
self, model: nn.Module, path: str, shard: bool = False, tokenizer: Optional[PreTrainedTokenizerBase] = None
) -> None:
if dist.get_rank() == 0:
unwrapped_model = self.unwrap_model(model)
assert isinstance(unwrapped_model, (Actor, Critic, RewardModel))
pretrained_model = unwrapped_model.model
assert isinstance(pretrained_model, PreTrainedModel)
# HACK: only use hf save_pretrained to save config
pretrained_model.save_pretrained(path, save_function=lambda *args, **kwargs: None)
if tokenizer is not None:
tokenizer.save_pretrained(path)
model_path = os.path.join(path, "pytorch_model.bin")
self.save_model(model, model_path, shard=shard)
def _replace_keys(model_path: str, replace_fn: Callable):
state_dict = torch.load(model_path, map_location="cpu")
state_dict = {replace_fn(k): v for k, v in state_dict.items()}
torch.save(state_dict, model_path)
# FIXME: save_model would add "model." prefix to keys of pytorch_model.bin
# HACK: rename keys of pytorch_model.bin
if dist.get_rank() == 0:
_replace_keys(model_path, lambda k: k.replace("model.", "", 1))
def get_model_state_dict_shard(self, model: nn.Module, **config):
# TODO: implement sharding on naive strategy
model = self.unwrap_model(model)
if "requires_grad_only" in config and config["requires_grad_only"] == True:
state_dict = get_grad_required_state_dict(model)
else:
state_dict = model.state_dict()
if "shard_size" in config:
shard_size = config["shard_size"]
accumulate_size = 0
state_dict_shard = OrderedDict()
for name, param in state_dict.items():
state_dict_shard[name] = param
accumulate_size += param.numel() * param.element_size()
if accumulate_size >= shard_size:
accumulate_size = 0
yield state_dict_shard
state_dict_shard = OrderedDict()
if accumulate_size > 0:
yield state_dict_shard
else:
yield state_dict


@ -1,31 +0,0 @@
import math
import numpy as np
class DistributedSampler:
def __init__(self, dataset, num_replicas: int, rank: int) -> None:
self.dataset = dataset
self.num_replicas = num_replicas
self.rank = rank
if len(self.dataset) % self.num_replicas != 0:
self.num_samples = math.ceil(
(len(self.dataset) - self.num_replicas) / self.num_replicas # type: ignore[arg-type]
)
else:
self.num_samples = math.ceil(len(self.dataset) / self.num_replicas)
self.total_size = self.num_samples * self.num_replicas
indices = list(range(len(self.dataset)))
indices = indices[: self.total_size]
assert len(indices) == self.total_size
# subsample
indices = indices[self.rank : self.total_size : self.num_replicas]
assert len(indices) == self.num_samples
self.indices = indices
def sample(self, batch_size: int) -> list:
sampled_indices = np.random.choice(self.indices, batch_size, replace=False)
return [self.dataset[idx] for idx in sampled_indices]


@ -1,50 +0,0 @@
from typing import Any
import torch
import torch.distributed as dist
from torch.utils._pytree import tree_map
from torch.utils.data import DataLoader
class CycledDataLoader:
"""
Why do we need this class?
In version 4da324cd60, "prompts = next(iter(self.prompt_dataloader))" is used to sample a batch of prompts/pretrain.
However, this may be inefficient due to frequent re-initialization of the dataloader. (re-initialize workers...)
NOTE: next(iter(dataloader)) is not equivalent to for batch in dataloader: break, it causes slightly different behavior.
"""
def __init__(
self,
dataloader: DataLoader,
) -> None:
self.dataloader = dataloader
self.count = 0
self.dataloader_iter = None
def next(self):
# defer initialization
if self.dataloader_iter is None:
self.dataloader_iter = iter(self.dataloader)
self.count += 1
try:
return next(self.dataloader_iter)
except StopIteration:
self.count = 0
self.dataloader_iter = iter(self.dataloader)
return next(self.dataloader_iter)
def is_rank_0() -> bool:
return not dist.is_initialized() or dist.get_rank() == 0
def to_device(x: Any, device: torch.device) -> Any:
def _to(t: Any):
if isinstance(t, torch.Tensor):
return t.to(device)
return t
return tree_map(_to, x)


@ -1,409 +0,0 @@
# Examples
## Table of Contents
- [Examples](#examples)
- [Table of Contents](#table-of-contents)
- [Install requirements](#install-requirements)
- [Supervised datasets collection](#supervised-datasets-collection)
- [Conversation dataset generation](#conversation-dataset-generation)
- [Stage1 - Supervised instructs tuning](#stage1---supervised-instructs-tuning)
- [Arg List](#arg-list)
- [Stage2 - Training reward model](#stage2---training-reward-model)
- [Features and tricks in RM training](#features-and-tricks-in-rm-training)
- [Experiment result](#experiment-result)
- [Arg List](#arg-list-1)
- [Stage3 - Training model using prompts with RL](#stage3---training-model-using-prompts-with-rl)
- [Arg List](#arg-list-2)
- [Inference example - After Stage3](#inference-example---after-stage3)
- [Attention](#attention)
- [data](#data)
- [Support Model](#support-model)
- [GPT](#gpt)
- [BLOOM](#bloom)
- [OPT](#opt)
- [LLaMA](#llama)
- [Add your own models](#add-your-own-models)
- [Actor model](#actor-model)
- [Reward model](#reward-model)
- [Critic model](#critic-model)
---
## Install requirements
```shell
pip install -r requirements.txt
```
## Supervised datasets collection
We collected a 104K bilingual (Chinese and English) dataset, and you can find it in the
[InstructionWild](https://github.com/XueFuzhao/InstructionWild) repo and in this [file](https://github.com/XueFuzhao/InstructionWild/blob/main/data/README.md).
Here is how we collected the data:
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/data-collect.png" width=500/>
</p>
### Conversation dataset generation
In order to further improve the model's ability to handle multi-turn conversations, we need to include samples with multi-turn conversations in the dataset. However, the samples in InstructWild and Alpaca datasets currently consist of only single-turn conversations, and their dataset organization is not suitable for storing multi-turn conversations. Additionally, after converting the aforementioned datasets, we also need to include multi-turn conversation datasets like ShareGPT, and we should transform them into the training format supported by ColossalChat.
A sample of the conversation dataset should have the following fields:
- `type` (str, optional): The type of the data sample.
- `language` (str, optional): The language of the data sample.
- `dataset` (str, optional): The dataset the data sample originates from.
- `conversations` (list, compulsory): Conversation content of the data sample.
- `id` (int, optional): The ID of the data sample.
A simple example:
```json
{
"type": "instruction",
"language": "English",
"dataset": "Alpaca",
"conversations": [
{
"from": "human",
"value": "Give three tips for staying healthy."
},
{
"from": "gpt",
"value": "1.Eat a balanced diet and make sure to include plenty of fruits and vegetables. \n2. Exercise regularly to keep your body active and strong. \n3. Get enough sleep and maintain a consistent sleep schedule."
}
],
"id": 1
}
```
> **NOTE:** Only the `conversations` key is compulsory for training; the other keys serve as metadata. The length of `conversations` varies.
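Below is a minimal sketch, not part of the official tooling, of how such a file can be loaded and its compulsory field checked; the helper name and file path are illustrative.
```python
import json


def load_conversation_dataset(path: str) -> list:
    """Load a conversation dataset and verify its only compulsory field."""
    with open(path, encoding="utf-8") as f:
        samples = json.load(f)
    for sample in samples:
        # `conversations` must be a non-empty list of {"from": ..., "value": ...} turns;
        # `type`, `language`, `dataset` and `id` are optional metadata.
        turns = sample["conversations"]
        assert isinstance(turns, list) and turns
        for turn in turns:
            assert "from" in turn and "value" in turn
    return samples


# samples = load_conversation_dataset("dataset.json")  # hypothetical path
```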
You can run `examples/generate_conversation_dataset.py` to generate a conversation dataset supported by ColossalChat.
You can use the following cmd to generate the conversation dataset.
```bash
python generate_conversation_dataset.py \
--dataset "All"
--save_path "/path/to/dataset"
```
## Stage1 - Supervised instructs tuning
Stage1 is supervised instruction fine-tuning, which uses the datasets mentioned earlier to fine-tune the model.
[[Stage1 tutorial video]](https://www.youtube.com/watch?v=-qFBZFmOJfg)
You can run `examples/train_sft.sh` to start supervised instruction fine-tuning.
You can also use the following cmd to start supervised instruction fine-tuning with your own settings.
```bash
torchrun --standalone --nproc_per_node=4 train_sft.py \
--pretrain "/path/to/LLaMa-7B/" \
--model 'llama' \
--strategy colossalai_zero2 \
--save_path /path/to/Coati-7B \
--dataset /path/to/data.json \
--batch_size 4 \
--accumulation_steps 8 \
--lr 2e-5 \
--max_datasets_size 512 \
--max_epochs 1 \
--grad_checkpoint
```
**Note**: the supervised dataset uses the following format (a minimal flattening sketch follows the example):
```json
[
{
"instruction": "Provide a list of the top 10 most popular mobile games in Asia",
"input": "",
"output": "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
"id": 0
},
...
]
```
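For reference, here is a minimal sketch of how one supervised record can be flattened into a single training text. The prompt template is purely illustrative; the actual template used by ColossalChat's `SupervisedDataset` may differ.
```python
def build_sft_text(sample: dict) -> str:
    # Flatten one {"instruction", "input", "output"} record into a single string.
    # This template is an assumption for illustration, not the exact training format.
    instruction = sample["instruction"]
    context = sample.get("input", "")
    prompt = f"{instruction}\n\n{context}".strip()
    return f"### Instruction:\n{prompt}\n\n### Response:\n{sample['output']}"


example = {
    "instruction": "Provide a list of the top 10 most popular mobile games in Asia",
    "input": "",
    "output": "The top 10 most popular mobile games in Asia are: ...",
    "id": 0,
}
print(build_sft_text(example))
```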
### Arg List
- `--strategy`: the strategy used for training, choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2'
- `--model`: model type, choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom'
- `--pretrain`: pretrain model, type=str, default=None
- `--max_datasets_size`: the max size of dataset, type=int, default=None
- `--save_path`: path to save the model, type=str, default='output'
- `--need_optim_ckpt`: whether to save optim ckpt, type=bool, default=False
- `--max_epochs`: max epochs for training, type=int, default=3
- `--batch_size`: batch size while training, type=int, default=4
- `--lora_rank`: low-rank adaptation matrices rank, type=int, default=0
- `--grad_checkpoint`: enable gradient checkpointing, type=bool, default=False
## Stage2 - Training reward model
We train a reward model in stage 2, which obtains corresponding scores by manually ranking different outputs for the same prompt and supervises the training of the reward model.
[[Stage2 tutorial video]](https://www.youtube.com/watch?v=gMx2CApKhuo)
You can run the `examples/train_rm.sh` to start a reward model training.
You can also use the following cmd to start training a reward model.
```bash
torchrun --standalone --nproc_per_node=4 train_reward_model.py \
--pretrain "/path/to/LLaMa-7B/" \
--model 'llama' \
--strategy colossalai_zero2 \
--loss_fn 'log_exp' \
--save_path 'rmstatic.pt'
```
### Features and tricks in RM training
- We support the [Anthropic/hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf) and [rm-static](https://huggingface.co/datasets/Dahoas/rm-static) datasets.
- We support two kinds of loss functions, named `log_sig` (used by OpenAI) and `log_exp` (used by Anthropic); see the sketch after this list.
- Instead of the loss, we monitor `valid_acc` and `pair_dist` to track progress during training.
- We add a special token to the end of the sequence to get better results.
- We use a cosine-annealing learning-rate scheduler for RM training.
- We implement the value head as a single linear layer and initialize its weight from the N(0, 1/(d_model + 1)) distribution.
- We train a BLOOM-560m reward model for 1 epoch and find that its test accuracy matches the performance reported in [Anthropic's paper](https://arxiv.org/abs/2204.05862).
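The two ranking losses and the value-head initialization above can be summarized with the following sketch. It follows the standard pairwise formulations and the N(0, 1/(d_model + 1)) initialization described in this list, but it is not a verbatim copy of the coati implementation; `d_model` is a hypothetical hidden size.
```python
import torch
import torch.nn as nn
import torch.nn.functional as F


def log_sig_loss(chosen_reward: torch.Tensor, reject_reward: torch.Tensor) -> torch.Tensor:
    # "log_sig": -log(sigmoid(r_chosen - r_rejected)), averaged over the batch.
    return -F.logsigmoid(chosen_reward - reject_reward).mean()


def log_exp_loss(chosen_reward: torch.Tensor, reject_reward: torch.Tensor) -> torch.Tensor:
    # "log_exp": log(1 + exp(r_rejected - r_chosen)); mathematically equivalent
    # to log_sig, but written as in Anthropic's paper.
    return torch.log(1 + torch.exp(reject_reward - chosen_reward)).mean()


# Value head: a single linear layer whose weights are drawn from N(0, 1/(d_model + 1)).
d_model = 1024  # hypothetical hidden size of the backbone
value_head = nn.Linear(d_model, 1)
value_head.weight.data.normal_(mean=0.0, std=1 / (d_model + 1))
```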
### Experiment result
Model performance in [Anthropic's paper](https://arxiv.org/abs/2204.05862):
<div align=middle> <img width="512" alt="image" src="https://user-images.githubusercontent.com/70618399/225263321-8d64c3a8-6877-4cc8-9b61-0e1c52d3d94f.png">
<div align=left>Our training & test result of bloom-560m for 1 epoch:
<div align=middle> <img width="512" alt="image" src="https://user-images.githubusercontent.com/70618399/225262950-a7f0a686-25de-44ec-98f2-11b83ea86674.png">
<div align=left>We also train a reward model based on LLaMA-7B, which reaches an accuracy of 72.06% after 1 epoch, performing almost the same as Anthropic's best RM.
### Arg List
- `--strategy`: the strategy used for training, choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2'
- `--model`: model type, choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom'
- `--pretrain`: pretrain model, type=str, default=None
- `--model_path`: the path of the rm model (if continuing training), type=str, default=None
- `--save_path`: path to save the model, type=str, default='output'
- `--need_optim_ckpt`: whether to save optim ckpt, type=bool, default=False
- `--max_epochs`: max epochs for training, type=int, default=3
- `--dataset`: dataset name, type=str, choices=['Anthropic/hh-rlhf', 'Dahoas/rm-static']
- `--subset`: subset of the dataset, type=str, default=None
- `--batch_size`: batch size while training, type=int, default=4
- `--lora_rank`: low-rank adaptation matrices rank, type=int, default=0
- `--loss_fn`: which loss function to use, choices=['log_sig', 'log_exp']
- `--max_len`: max sentence length, type=int, default=512
## Stage3 - Training model using prompts with RL
Stage3 uses a reinforcement learning algorithm, which is the most complex part of the training process, as shown below:
<p align="center">
<img src="https://raw.githubusercontent.com/hpcaitech/public_assets/main/applications/chat/stage-3.jpeg" width=800/>
</p>
You can run `examples/train_prompts.sh` to start PPO training.
You can also use the following cmd to start PPO training.
[[Stage3 tutorial video]](https://www.youtube.com/watch?v=Z8wwSHxPL9g)
```bash
torchrun --standalone --nproc_per_node=4 train_prompts.py \
--pretrain "/path/to/LLaMa-7B/" \
--model 'llama' \
--strategy colossalai_zero2 \
--prompt_dataset /path/to/your/prompt_dataset \
--pretrain_dataset /path/to/your/pretrain_dataset \
--rm_pretrain /your/pretrain/rm/definition \
--rm_path /your/rm/model/path
```
Prompt dataset: the instruction dataset mentioned in the above figure, which contains the instructions. For example, you can use the [script](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/examples/generate_prompt_dataset.py), which samples `instinwild_en.json` or `instinwild_ch.json` from [InstructionWild](https://github.com/XueFuzhao/InstructionWild/tree/main/data#instructwild-data), to generate the prompt dataset.
Pretrain dataset: the pretraining dataset containing the instructions and corresponding responses. For example, you can use the [InstructWild Data](https://github.com/XueFuzhao/InstructionWild/tree/main/data) from stage 1 supervised instruction tuning.
**Note**: the required datasets use the following format:
- `pretrain dataset`
```json
[
{
"instruction": "Provide a list of the top 10 most popular mobile games in Asia",
"input": "",
"output": "The top 10 most popular mobile games in Asia are:\n1) PUBG Mobile\n2) Pokemon Go\n3) Candy Crush Saga\n4) Free Fire\n5) Clash of Clans\n6) Mario Kart Tour\n7) Arena of Valor\n8) Fantasy Westward Journey\n9) Subway Surfers\n10) ARK Survival Evolved",
"id": 0
},
...
]
```
- `prompt dataset`
```json
[
{
"instruction": "Edit this paragraph to make it more concise: \"Yesterday, I went to the store and bought some things. Then, I came home and put them away. After that, I went for a walk and met some friends.\"",
"id": 0
},
{
"instruction": "Write a descriptive paragraph about a memorable vacation you went on",
"id": 1
},
...
]
```
### Arg List
- `--strategy`: the strategy used for training, choices=['ddp', 'colossalai_gemini', 'colossalai_zero2'], default='colossalai_zero2'
- `--model`: model type of actor, choices=['gpt2', 'bloom', 'opt', 'llama'], default='bloom'
- `--pretrain`: pretrain model, type=str, default=None
- `--rm_model`: reward model type, type=str, choices=['gpt2', 'bloom', 'opt', 'llama'], default=None
- `--rm_pretrain`: pretrain model for reward model, type=str, default=None
- `--rm_path`: the path of rm model, type=str, default=None
- `--save_path`: path to save the model, type=str, default='output'
- `--prompt_dataset`: path of the prompt dataset, type=str, default=None
- `--pretrain_dataset`: path of the ptx dataset, type=str, default=None
- `--need_optim_ckpt`: whether to save optim ckpt, type=bool, default=False
- `--num_episodes`: num of episodes for training, type=int, default=10
- `--num_update_steps`: number of steps to update policy per episode, type=int
- `--num_collect_steps`: number of steps to collect experience per episode, type=int
- `--train_batch_size`: batch size while training, type=int, default=8
- `--ptx_batch_size`: batch size to compute ptx loss, type=int, default=1
- `--experience_batch_size`: batch size to make experience, type=int, default=8
- `--lora_rank`: low-rank adaptation matrices rank, type=int, default=0
- `--kl_coef`: kl_coef used for computing the reward, type=float, default=0.1
- `--ptx_coef`: ptx_coef used for computing the policy loss, type=float, default=0.9 (see the sketch after this list)
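To make the roles of `kl_coef` and `ptx_coef` concrete, here is a hedged sketch of how these coefficients commonly enter an RLHF step. The exact formulas used by `PPOTrainer` may differ, and all tensor names and shapes below are illustrative.
```python
import torch


def shaped_reward(rm_score: torch.Tensor,
                  action_log_probs: torch.Tensor,
                  base_log_probs: torch.Tensor,
                  kl_coef: float = 0.1) -> torch.Tensor:
    # Penalize per-token divergence from the frozen initial model, then add the
    # sequence-level reward-model score. Shapes: rm_score (B,), *_log_probs (B, T).
    approx_kl = action_log_probs - base_log_probs   # (B, T)
    return rm_score - kl_coef * approx_kl.sum(dim=-1)


def actor_objective(policy_loss: torch.Tensor,
                    ptx_loss: torch.Tensor,
                    ptx_coef: float = 0.9) -> torch.Tensor:
    # Mix the PPO policy loss with the pretraining (ptx) language-modeling loss
    # so the actor does not drift too far from its pretraining distribution.
    return policy_loss + ptx_coef * ptx_loss
```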
## Inference example - After Stage3
We support different inference options, including int8 and int4 quantization.
For details, see [`inference/`](https://github.com/hpcaitech/ColossalAI/tree/main/applications/Chat/inference).
## Attention
The examples are demos for the whole training process. You need to tune the hyper-parameters to reach good performance.
#### data
- [x] [rm-static](https://huggingface.co/datasets/Dahoas/rm-static)
- [x] [hh-rlhf](https://huggingface.co/datasets/Anthropic/hh-rlhf)
- [ ] [openai/summarize_from_feedback](https://huggingface.co/datasets/openai/summarize_from_feedback)
- [ ] [openai/webgpt_comparisons](https://huggingface.co/datasets/openai/webgpt_comparisons)
- [ ] [Dahoas/instruct-synthetic-prompt-responses](https://huggingface.co/datasets/Dahoas/instruct-synthetic-prompt-responses)
## Support Model
### GPT
- [x] GPT2-S (s)
- [x] GPT2-M (m)
- [x] GPT2-L (l)
- [x] GPT2-XL (xl)
- [x] GPT2-4B (4b)
- [ ] GPT2-6B (6b)
### BLOOM
- [x] [BLOOM-560m](https://huggingface.co/bigscience/bloom-560m)
- [x] [BLOOM-1b1](https://huggingface.co/bigscience/bloom-1b1)
- [x] [BLOOM-3b](https://huggingface.co/bigscience/bloom-3b)
- [x] [BLOOM-7b](https://huggingface.co/bigscience/bloom-7b1)
- [ ] [BLOOM-175b](https://huggingface.co/bigscience/bloom)
### OPT
- [x] [OPT-125M](https://huggingface.co/facebook/opt-125m)
- [x] [OPT-350M](https://huggingface.co/facebook/opt-350m)
- [x] [OPT-1.3B](https://huggingface.co/facebook/opt-1.3b)
- [x] [OPT-2.7B](https://huggingface.co/facebook/opt-2.7b)
- [x] [OPT-6.7B](https://huggingface.co/facebook/opt-6.7b)
- [ ] [OPT-13B](https://huggingface.co/facebook/opt-13b)
- [ ] [OPT-30B](https://huggingface.co/facebook/opt-30b)
### [LLaMA](https://github.com/facebookresearch/llama/blob/main/MODEL_CARD.md)
- [x] LLaMA-7B
- [x] LLaMA-13B
- [ ] LLaMA-33B
- [ ] LLaMA-65B
## Add your own models
If you want to support your own model in Coati, please refer to the pull request for RoBERTa support as an example, [[chatgpt] add pre-trained model RoBERTa for RLHF stage 2 & 3](https://github.com/hpcaitech/ColossalAI/pull/3223), and submit a PR to us.
You should complete the implementation of four model classes: the Reward model, Critic model, LM model, and Actor model.
Here is some example code for a new model named `Coati`.
If it is supported in huggingface [transformers](https://github.com/huggingface/transformers), you can load it with `from_pretrained`, or you can build your own model by yourself.
### Actor model
```python
from typing import Optional
from ..base import Actor
from transformers.models.coati import CoatiModel
class CoatiActor(Actor):
def __init__(self,
pretrained: Optional[str] = None,
checkpoint: bool = False,
lora_rank: int = 0,
lora_train_bias: str = 'none') -> None:
if pretrained is not None:
model = CoatiModel.from_pretrained(pretrained)
else:
model = build_model() # load your own model if it is not supported in transformers
super().__init__(model, lora_rank, lora_train_bias)
```
### Reward model
```python
from typing import Optional
import torch.nn as nn
from ..base import RewardModel
from transformers.models.coati import CoatiModel
class CoatiRM(RewardModel):
def __init__(self,
pretrained: Optional[str] = None,
checkpoint: bool = False,
lora_rank: int = 0,
lora_train_bias: str = 'none') -> None:
if pretrained is not None:
model = CoatiModel.from_pretrained(pretrained)
else:
model = build_model() # load your own model if it is not supported in transformers
value_head = nn.Linear(model.config.n_embd, 1)
value_head.weight.data.normal_(mean=0.0, std=1 / (model.config.n_embd + 1))
super().__init__(model, value_head, lora_rank, lora_train_bias)
```
### Critic model
```python
from typing import Optional
import torch.nn as nn
from ..base import Critic
from transformers.models.coati import CoatiModel
class CoatiCritic(Critic):
def __init__(self,
pretrained: Optional[str] = None,
checkpoint: bool = False,
lora_rank: int = 0,
lora_train_bias: str = 'none') -> None:
if pretrained is not None:
model = CoatiModel.from_pretrained(pretrained)
else:
model = build_model() # load your own model if it is not supported in transformers
value_head = nn.Linear(model.config.n_embd, 1)
value_head.weight.data.normal_(mean=0.0, std=1 / (model.config.n_embd + 1))
super().__init__(model, value_head, lora_rank, lora_train_bias)
```


@ -1,79 +0,0 @@
import argparse
import dataclasses
import os
from typing import List
import tqdm
from coati.models.bloom import BLOOMRM, BLOOMActor, BLOOMCritic
from coati.models.gpt import GPTRM, GPTActor, GPTCritic
from coati.models.opt import OPTRM, OPTActor, OPTCritic
from huggingface_hub import hf_hub_download, snapshot_download
from transformers import AutoConfig, AutoTokenizer, BloomConfig, BloomTokenizerFast, GPT2Config, GPT2Tokenizer
@dataclasses.dataclass
class HFRepoFiles:
repo_id: str
files: List[str]
def download(self, dir_path: str):
for file in self.files:
file_path = hf_hub_download(self.repo_id, file, local_dir=dir_path)
def download_all(self):
snapshot_download(self.repo_id)
def test_init(model: str, dir_path: str):
if model == "gpt2":
config = GPT2Config.from_pretrained(dir_path)
actor = GPTActor(config=config)
critic = GPTCritic(config=config)
reward_model = GPTRM(config=config)
GPT2Tokenizer.from_pretrained(dir_path)
elif model == "bloom":
config = BloomConfig.from_pretrained(dir_path)
actor = BLOOMActor(config=config)
critic = BLOOMCritic(config=config)
reward_model = BLOOMRM(config=config)
BloomTokenizerFast.from_pretrained(dir_path)
elif model == "opt":
config = AutoConfig.from_pretrained(dir_path)
actor = OPTActor(config=config)
critic = OPTCritic(config=config)
reward_model = OPTRM(config=config)
AutoTokenizer.from_pretrained(dir_path)
else:
raise NotImplementedError(f"Model {model} not implemented")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model-dir", type=str, default="test_models")
parser.add_argument("--config-only", default=False, action="store_true")
args = parser.parse_args()
if os.path.exists(args.model_dir):
print(f"[INFO]: {args.model_dir} already exists")
exit(0)
repo_list = {
"gpt2": HFRepoFiles(repo_id="gpt2", files=["config.json", "tokenizer.json", "vocab.json", "merges.txt"]),
"bloom": HFRepoFiles(
repo_id="bigscience/bloom-560m", files=["config.json", "tokenizer.json", "tokenizer_config.json"]
),
"opt": HFRepoFiles(
repo_id="facebook/opt-350m", files=["config.json", "tokenizer_config.json", "vocab.json", "merges.txt"]
),
}
os.mkdir(args.model_dir)
for model_name in tqdm.tqdm(repo_list):
dir_path = os.path.join(args.model_dir, model_name)
if args.config_only:
os.mkdir(dir_path)
repo_list[model_name].download(dir_path)
else:
repo_list[model_name].download_all()
test_init(model_name, dir_path)


@ -1,82 +0,0 @@
import argparse
import json
from datasets import load_dataset
def generate_alpaca():
# We can convert dataset with the same format("instruction", "input", "output") as Alpaca into a one-round conversation.
conversation_dataset = []
dataset = load_dataset("tatsu-lab/alpaca", split="train")
instructions = dataset["instruction"]
inputs = dataset["input"]
outputs = dataset["output"]
assert len(instructions) == len(inputs) == len(outputs)
for idx in range(len(instructions)):
human_utterance = instructions[idx] + "\n\n" + inputs[idx] if inputs[idx] else instructions[idx]
human = {"from": "human", "value": human_utterance}
gpt_utterance = outputs[idx]
gpt = {"from": "gpt", "value": gpt_utterance}
conversation = dict(type="instruction", language="English", dataset="Alpaca", conversations=[human, gpt])
conversation_dataset.append(conversation)
return conversation_dataset
def generate_sharegpt():
# ShareGPT data requires less processing.
conversation_dataset = []
dataset = load_dataset(
"anon8231489123/ShareGPT_Vicuna_unfiltered",
data_files="ShareGPT_V3_unfiltered_cleaned_split_no_imsorry.json",
split="train",
)
conversations = dataset["conversations"]
for idx in range(len(conversations)):
for conv in conversations[idx]:
# We don't need markdown and text value.
del conv["markdown"]
del conv["text"]
conversation = dict(
type="conversation", language="Multilingual", dataset="ShareGPT", conversations=conversations[idx]
)
conversation_dataset.append(conversation)
return conversation_dataset
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--dataset",
type=str,
default="All",
choices=["Alpaca", "ShareGPT", "All"],
help="which dataset to convert, All will combine Alpaca and ShareGPT",
)
parser.add_argument("--save_path", type=str, default="dataset.json", help="path to save the converted dataset")
args = parser.parse_args()
conversation_dataset = []
if args.dataset == "Alpaca":
conversation_dataset.extend(generate_alpaca())
elif args.dataset == "ShareGPT":
conversation_dataset.extend(generate_sharegpt())
else:
conversation_dataset.extend(generate_alpaca())
conversation_dataset.extend(generate_sharegpt())
for idx, sample in enumerate(conversation_dataset):
sample["id"] = idx + 1
with open(args.save_path, mode="w") as f:
json.dump(conversation_dataset, f, indent=4, default=str, ensure_ascii=False)


@ -1,27 +0,0 @@
import argparse
import json
import random
random.seed(42)
def sample(args):
with open(args.dataset_path, mode="r") as f:
dataset_list = json.load(f)
sampled_dataset = [
{"instruction": sample["instruction"], "id": idx}
for idx, sample in enumerate(random.sample(dataset_list, args.sample_size))
]
with open(args.save_path, mode="w") as f:
json.dump(sampled_dataset, f, indent=4, default=str, ensure_ascii=False)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--dataset_path", type=str, default=None, required=True, help="path to the pretrain dataset")
parser.add_argument("--save_path", type=str, default="prompt.json", help="path to save the prompt dataset")
parser.add_argument("--sample_size", type=int, default=16384, help="size of the prompt dataset")
args = parser.parse_args()
sample(args)


@ -1,73 +0,0 @@
import argparse
import torch
from coati.models.bloom import BLOOMActor
from coati.models.generation import generate
from coati.models.gpt import GPTActor
from coati.models.llama import LlamaActor
from coati.models.opt import OPTActor
from transformers import AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer, LlamaTokenizer
def eval(args):
# configure model
if args.model == "gpt2":
actor = GPTActor(pretrained=args.pretrain)
elif args.model == "bloom":
actor = BLOOMActor(pretrained=args.pretrain)
elif args.model == "opt":
actor = OPTActor(pretrained=args.pretrain)
elif args.model == "llama":
actor = LlamaActor(pretrained=args.pretrain)
else:
raise ValueError(f'Unsupported model "{args.model}"')
actor.to(torch.cuda.current_device())
if args.model_path is not None:
state_dict = torch.load(args.model_path)
actor.load_state_dict(state_dict)
# configure tokenizer
if args.model == "gpt2":
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
elif args.model == "bloom":
tokenizer = BloomTokenizerFast.from_pretrained("bigscience/bloom-560m")
tokenizer.pad_token = tokenizer.eos_token
elif args.model == "opt":
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m")
tokenizer.pad_token = tokenizer.eos_token
elif args.model == "llama":
tokenizer = LlamaTokenizer.from_pretrained("hf-internal-testing/llama-tokenizer")
tokenizer.eos_token = "</s>"
tokenizer.pad_token = tokenizer.unk_token
else:
raise ValueError(f'Unsupported model "{args.model}"')
actor.eval()
tokenizer.padding_side = "left"
input_ids = tokenizer.encode(args.input, return_tensors="pt").to(torch.cuda.current_device())
outputs = generate(
actor,
input_ids,
tokenizer=tokenizer,
max_length=args.max_length,
do_sample=True,
top_k=50,
top_p=0.95,
num_return_sequences=1,
)
output = tokenizer.batch_decode(outputs[0], skip_special_tokens=True)
print(f"[Output]: {''.join(output)}")
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
# We suggest using a pretrained model from HuggingFace; use --pretrain to configure the model
parser.add_argument("--pretrain", type=str, default=None)
parser.add_argument("--model_path", type=str, default=None)
parser.add_argument("--input", type=str, default="Question: How are you ? Answer:")
parser.add_argument("--max_length", type=int, default=100)
args = parser.parse_args()
eval(args)


@ -1,181 +0,0 @@
import argparse
import os
import socket
from functools import partial
import pandas as pd
import ray
from coati.quant import llama_load_quant, low_resource_init
from coati.ray.detached_trainer_ppo import DetachedPPOTrainer
from coati.ray.experience_maker_holder import ExperienceMakerHolder
from coati.ray.utils import (
get_actor_from_args,
get_critic_from_args,
get_reward_model_from_args,
get_strategy_from_args,
get_tokenizer_from_args,
)
from torch.utils.data import DataLoader
from transformers import AutoConfig
from transformers.modeling_utils import no_init_weights
def get_free_port():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("", 0))
return s.getsockname()[1]
def get_local_ip():
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
s.connect(("8.8.8.8", 80))
return s.getsockname()[0]
def main(args):
master_addr = str(get_local_ip())
# trainer_env_info
trainer_port = str(get_free_port())
env_info_trainers = [
{
"local_rank": "0",
"rank": str(rank),
"world_size": str(args.num_trainers),
"master_port": trainer_port,
"master_addr": master_addr,
}
for rank in range(args.num_trainers)
]
# maker_env_info
maker_port = str(get_free_port())
env_info_maker = {
"local_rank": "0",
"rank": "0",
"world_size": "1",
"master_port": maker_port,
"master_addr": master_addr,
}
# configure tokenizer
tokenizer = get_tokenizer_from_args(args.model)
def trainer_model_fn():
actor = get_actor_from_args(args.model, args.pretrain).half().cuda()
critic = get_critic_from_args(args.model, args.critic_pretrain).half().cuda()
return actor, critic
# configure Trainer
trainer_refs = [
DetachedPPOTrainer.options(name=f"trainer{i}", num_gpus=1, max_concurrency=2).remote(
experience_maker_holder_name_list=["maker1"],
strategy_fn=partial(get_strategy_from_args, args.trainer_strategy),
model_fn=trainer_model_fn,
env_info=env_info_trainer,
train_batch_size=args.train_batch_size,
buffer_limit=16,
eval_performance=True,
debug=args.debug,
update_lora_weights=not (args.lora_rank == 0),
)
for i, env_info_trainer in enumerate(env_info_trainers)
]
def model_fn():
actor = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda()
critic = get_critic_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda()
reward_model = get_reward_model_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda()
if args.initial_model_quant_ckpt is not None and args.model == "llama":
# quantize initial model
actor_cfg = AutoConfig.from_pretrained(args.pretrain)
with low_resource_init(), no_init_weights():
initial_model = get_actor_from_args(args.model, config=actor_cfg)
initial_model.model = (
llama_load_quant(
initial_model.model, args.initial_model_quant_ckpt, args.quant_bits, args.quant_group_size
)
.cuda()
.requires_grad_(False)
)
else:
initial_model = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda()
return actor, critic, reward_model, initial_model
# configure Experience Maker
experience_holder_ref = ExperienceMakerHolder.options(name="maker1", num_gpus=1, max_concurrency=2).remote(
detached_trainer_name_list=[f"trainer{i}" for i in range(args.num_trainers)],
strategy_fn=partial(get_strategy_from_args, args.maker_strategy),
model_fn=model_fn,
env_info=env_info_maker,
experience_batch_size=args.experience_batch_size,
kl_coef=0.1,
debug=args.debug,
update_lora_weights=not (args.lora_rank == 0),
# sync_models_from_trainers=True,
# generation kwargs:
max_length=512,
do_sample=True,
temperature=1.0,
top_k=50,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
eval_performance=True,
use_cache=True,
)
# uncomment this function if sync_models_from_trainers is True
# ray.get([
# trainer_ref.sync_models_to_remote_makers.remote()
# for trainer_ref in trainer_refs
# ])
wait_tasks = []
total_steps = args.experience_batch_size * args.experience_steps // (args.num_trainers * args.train_batch_size)
for trainer_ref in trainer_refs:
wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs))
dataset_size = args.experience_batch_size * 4
def build_dataloader():
def tokenize_fn(texts):
batch = tokenizer(texts, return_tensors="pt", max_length=96, padding="max_length", truncation=True)
return {k: v.cuda() for k, v in batch.items()}
dataset = pd.read_csv(args.prompt_path)["prompt"]
dataloader = DataLoader(dataset=dataset, batch_size=dataset_size, shuffle=True, collate_fn=tokenize_fn)
return dataloader
wait_tasks.append(experience_holder_ref.workingloop.remote(build_dataloader, num_steps=args.experience_steps))
ray.get(wait_tasks)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--prompt_path", type=str, default=None)
parser.add_argument("--num_trainers", type=int, default=1)
parser.add_argument(
"--trainer_strategy",
choices=["ddp", "colossalai_gemini", "colossalai_zero2", "colossalai_gemini_cpu", "colossalai_zero2_cpu"],
default="ddp",
)
parser.add_argument("--maker_strategy", choices=["naive"], default="naive")
parser.add_argument("--model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
parser.add_argument("--critic_model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
parser.add_argument("--pretrain", type=str, default=None)
parser.add_argument("--critic_pretrain", type=str, default=None)
parser.add_argument("--experience_steps", type=int, default=4)
parser.add_argument("--experience_batch_size", type=int, default=8)
parser.add_argument("--train_epochs", type=int, default=1)
parser.add_argument("--update_steps", type=int, default=2)
parser.add_argument("--train_batch_size", type=int, default=8)
parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
parser.add_argument("--initial_model_quant_ckpt", type=str, default=None)
parser.add_argument("--quant_bits", type=int, default=4)
parser.add_argument("--quant_group_size", type=int, default=128)
parser.add_argument("--debug", action="store_true")
args = parser.parse_args()
ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)})
main(args)


@ -1,201 +0,0 @@
import argparse
import os
import socket
from functools import partial
import pandas as pd
import ray
from coati.quant import llama_load_quant, low_resource_init
from coati.ray.detached_trainer_ppo import DetachedPPOTrainer
from coati.ray.experience_maker_holder import ExperienceMakerHolder
from coati.ray.utils import (
get_actor_from_args,
get_critic_from_args,
get_receivers_per_sender,
get_reward_model_from_args,
get_strategy_from_args,
)
from torch.utils.data import DataLoader
from transformers import AutoConfig, AutoTokenizer
from transformers.modeling_utils import no_init_weights
def get_free_port():
with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
s.bind(("", 0))
return s.getsockname()[1]
def get_local_ip():
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
s.connect(("8.8.8.8", 80))
return s.getsockname()[0]
def main(args):
master_addr = str(get_local_ip())
# trainer_env_info
trainer_port = str(get_free_port())
env_info_trainers = [
{
"local_rank": "0",
"rank": str(rank),
"world_size": str(args.num_trainers),
"master_port": trainer_port,
"master_addr": master_addr,
}
for rank in range(args.num_trainers)
]
# maker_env_info
maker_port = str(get_free_port())
env_info_makers = [
{
"local_rank": "0",
"rank": str(rank),
"world_size": str(args.num_makers),
"master_port": maker_port,
"master_addr": master_addr,
}
for rank in range(args.num_makers)
]
# configure tokenizer
tokenizer = AutoTokenizer.from_pretrained(args.pretrain)
tokenizer.pad_token = tokenizer.eos_token
def model_fn():
actor = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda()
critic = get_critic_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda()
reward_model = get_reward_model_from_args(args.model, args.critic_pretrain).requires_grad_(False).half().cuda()
if args.initial_model_quant_ckpt is not None and args.model == "llama":
# quantize initial model
actor_cfg = AutoConfig.from_pretrained(args.pretrain)
with low_resource_init(), no_init_weights():
initial_model = get_actor_from_args(args.model, config=actor_cfg)
initial_model.model = (
llama_load_quant(
initial_model.model, args.initial_model_quant_ckpt, args.quant_bits, args.quant_group_size
)
.cuda()
.requires_grad_(False)
)
else:
initial_model = get_actor_from_args(args.model, args.pretrain).requires_grad_(False).half().cuda()
return actor, critic, reward_model, initial_model
# configure Experience Maker
experience_holder_refs = [
ExperienceMakerHolder.options(name=f"maker{i}", num_gpus=1, max_concurrency=2).remote(
detached_trainer_name_list=[
f"trainer{x}"
for x in get_receivers_per_sender(i, args.num_makers, args.num_trainers, allow_idle_sender=False)
],
strategy_fn=partial(get_strategy_from_args, args.maker_strategy),
model_fn=model_fn,
env_info=env_info_maker,
kl_coef=0.1,
debug=args.debug,
update_lora_weights=not (args.lora_rank == 0),
# sync_models_from_trainers=True,
# generation kwargs:
max_length=512,
do_sample=True,
temperature=1.0,
top_k=50,
pad_token_id=tokenizer.pad_token_id,
eos_token_id=tokenizer.eos_token_id,
eval_performance=True,
use_cache=True,
)
for i, env_info_maker in enumerate(env_info_makers)
]
def trainer_model_fn():
actor = get_actor_from_args(args.model, args.pretrain, lora_rank=args.lora_rank).half().cuda()
critic = get_critic_from_args(args.model, args.critic_pretrain, lora_rank=args.lora_rank).half().cuda()
return actor, critic
# configure Trainer
trainer_refs = [
DetachedPPOTrainer.options(name=f"trainer{i}", num_gpus=1, max_concurrency=2).remote(
experience_maker_holder_name_list=[
f"maker{x}"
for x in get_receivers_per_sender(i, args.num_trainers, args.num_makers, allow_idle_sender=True)
],
strategy_fn=partial(get_strategy_from_args, args.trainer_strategy),
model_fn=trainer_model_fn,
env_info=env_info_trainer,
train_batch_size=args.train_batch_size,
buffer_limit=16,
eval_performance=True,
debug=args.debug,
update_lora_weights=not (args.lora_rank == 0),
)
for i, env_info_trainer in enumerate(env_info_trainers)
]
dataset_size = args.experience_batch_size * 4
def build_dataloader():
def tokenize_fn(texts):
batch = tokenizer(texts, return_tensors="pt", max_length=96, padding="max_length", truncation=True)
return {k: v.cuda() for k, v in batch.items()}
dataset = pd.read_csv(args.prompt_path)["prompt"]
dataloader = DataLoader(dataset=dataset, batch_size=dataset_size, shuffle=True, collate_fn=tokenize_fn)
return dataloader
# uncomment this function if sync_models_from_trainers is True
# ray.get([
# trainer_ref.sync_models_to_remote_makers.remote()
# for trainer_ref in trainer_refs
# ])
wait_tasks = []
for experience_holder_ref in experience_holder_refs:
wait_tasks.append(experience_holder_ref.workingloop.remote(build_dataloader, num_steps=args.experience_steps))
total_steps = (
args.experience_batch_size
* args.experience_steps
* args.num_makers
// (args.num_trainers * args.train_batch_size)
)
for trainer_ref in trainer_refs:
wait_tasks.append(trainer_ref.fit.remote(total_steps, args.update_steps, args.train_epochs))
ray.get(wait_tasks)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--prompt_path", type=str, default=None)
parser.add_argument("--num_makers", type=int, default=1)
parser.add_argument("--num_trainers", type=int, default=1)
parser.add_argument(
"--trainer_strategy",
choices=["ddp", "colossalai_gemini", "colossalai_zero2", "colossalai_gemini_cpu", "colossalai_zero2_cpu"],
default="ddp",
)
parser.add_argument("--maker_strategy", choices=["naive"], default="naive")
parser.add_argument("--model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
parser.add_argument("--critic_model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
parser.add_argument("--pretrain", type=str, default=None)
parser.add_argument("--critic_pretrain", type=str, default=None)
parser.add_argument("--experience_steps", type=int, default=4)
parser.add_argument("--experience_batch_size", type=int, default=8)
parser.add_argument("--train_epochs", type=int, default=1)
parser.add_argument("--update_steps", type=int, default=2)
parser.add_argument("--train_batch_size", type=int, default=8)
parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
parser.add_argument("--initial_model_quant_ckpt", type=str, default=None)
parser.add_argument("--quant_bits", type=int, default=4)
parser.add_argument("--quant_group_size", type=int, default=128)
parser.add_argument("--debug", action="store_true")
args = parser.parse_args()
ray.init(namespace=os.environ["RAY_NAMESPACE"], runtime_env={"env_vars": dict(os.environ)})
main(args)


@ -1 +0,0 @@
ray


@ -1,12 +0,0 @@
#!/bin/bash
set -xe
BASE=$(realpath $(dirname $0))
export RAY_NAMESPACE=admin
export DATA=/data/scratch/chatgpt/prompts.csv
# install requirements
pip install -r ${BASE}/requirements.txt
python ${BASE}/mmmt_prompt.py --prompt_path $DATA --num_makers 2 --num_trainers 2 --trainer_strategy colossalai_gemini --model opt --critic_model opt --pretrain facebook/opt-350m --critic_pretrain facebook/opt-125m --experience_batch_size 4 --train_batch_size 2


@ -1,3 +0,0 @@
pandas>=1.4.1
sentencepiece
colossalai==0.3.3


@ -1,249 +0,0 @@
import argparse
import warnings
import torch
import torch.distributed as dist
from coati.dataset import PromptDataset, SupervisedDataset
from coati.models.bloom import BLOOMRM, BLOOMActor, BLOOMCritic
from coati.models.gpt import GPTRM, GPTActor, GPTCritic
from coati.models.llama import LlamaActor, LlamaCritic, LlamaRM
from coati.models.opt import OPTRM, OPTActor, OPTCritic
from coati.trainer import PPOTrainer
from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
from torch.optim import Adam
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from transformers import AutoTokenizer, BloomTokenizerFast, GPT2Tokenizer, LlamaTokenizer
from colossalai.nn.optimizer import HybridAdam
def main(args):
# configure strategy
if args.strategy == "ddp":
strategy = DDPStrategy()
elif args.strategy == "colossalai_gemini":
strategy = GeminiStrategy(placement_policy="static", initial_scale=2**5)
elif args.strategy == "colossalai_zero2":
strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
else:
raise ValueError(f'Unsupported strategy "{args.strategy}"')
if args.rm_path is not None:
warnings.warn("LoRA weights should be merged with the model weights")
state_dict = torch.load(args.rm_path, map_location="cpu")
if args.lora_rank > 0:
warnings.warn("Lora is not supported yet.")
args.lora_rank = 0
with strategy.model_init_context():
# configure model
if args.model == "gpt2":
initial_model = GPTActor(pretrained=args.pretrain)
elif args.model == "bloom":
initial_model = BLOOMActor(pretrained=args.pretrain)
elif args.model == "opt":
initial_model = OPTActor(pretrained=args.pretrain)
elif args.model == "llama":
initial_model = LlamaActor(pretrained=args.pretrain)
else:
raise ValueError(f'Unsupported actor model "{args.model}"')
if args.rm_model is None:
rm_model_name = args.model
else:
rm_model_name = args.rm_model
if rm_model_name == "gpt2":
reward_model = GPTRM(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
elif rm_model_name == "bloom":
reward_model = BLOOMRM(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
elif rm_model_name == "opt":
reward_model = OPTRM(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
elif rm_model_name == "llama":
reward_model = LlamaRM(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
else:
raise ValueError(f'Unsupported reward model "{rm_model_name}"')
if args.rm_path is not None:
reward_model.load_state_dict(state_dict, strict=False)
initial_model.to(torch.bfloat16).to(torch.cuda.current_device())
reward_model.to(torch.bfloat16).to(torch.cuda.current_device())
if args.model == "gpt2":
actor = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
elif args.model == "bloom":
actor = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
elif args.model == "opt":
actor = OPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
elif args.model == "llama":
actor = LlamaActor(pretrained=args.pretrain, lora_rank=args.lora_rank)
else:
raise ValueError(f'Unsupported actor model "{args.model}"')
if rm_model_name == "gpt2":
critic = GPTCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
elif rm_model_name == "bloom":
critic = BLOOMCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
elif rm_model_name == "opt":
critic = OPTCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
elif rm_model_name == "llama":
critic = LlamaCritic(pretrained=args.rm_pretrain, lora_rank=args.lora_rank)
else:
raise ValueError(f'Unsupported reward model "{rm_model_name}"')
if args.rm_path is not None:
critic.load_state_dict(state_dict, strict=False)
del state_dict
actor.to(torch.bfloat16).to(torch.cuda.current_device())
critic.to(torch.bfloat16).to(torch.cuda.current_device())
# configure optimizer
if args.strategy.startswith("colossalai"):
actor_optim = HybridAdam(actor.parameters(), lr=args.lr)
critic_optim = HybridAdam(critic.parameters(), lr=args.lr)
else:
actor_optim = Adam(actor.parameters(), lr=args.lr)
critic_optim = Adam(critic.parameters(), lr=args.lr)
# configure tokenizer
if args.model == "gpt2":
tokenizer = GPT2Tokenizer.from_pretrained("gpt2" if args.tokenizer is None else args.tokenizer)
tokenizer.pad_token = tokenizer.eos_token
elif args.model == "bloom":
tokenizer = BloomTokenizerFast.from_pretrained(
"bigscience/bloom-560m" if args.tokenizer is None else args.tokenizer
)
tokenizer.pad_token = tokenizer.eos_token
elif args.model == "opt":
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m" if args.tokenizer is None else args.tokenizer)
tokenizer.pad_token = tokenizer.eos_token
elif args.model == "llama":
tokenizer = LlamaTokenizer.from_pretrained(
"hf-internal-testing/llama-tokenizer" if args.tokenizer is None else args.tokenizer
)
tokenizer.eos_token = "</s>"
tokenizer.pad_token = tokenizer.unk_token
else:
raise ValueError(f'Unsupported model "{args.model}"')
# NOTE: generate() requires padding_side to be "left"
tokenizer.padding_side = "left"
prompt_dataset = PromptDataset(
tokenizer=tokenizer,
data_path=args.prompt_dataset,
max_datasets_size=args.max_datasets_size,
max_length=args.max_input_len,
)
if dist.is_initialized() and dist.get_world_size() > 1:
prompt_sampler = DistributedSampler(prompt_dataset, shuffle=True, seed=42, drop_last=True)
else:
prompt_sampler = None
prompt_dataloader = DataLoader(
prompt_dataset, shuffle=(prompt_sampler is None), sampler=prompt_sampler, batch_size=args.experience_batch_size
)
pretrain_dataset = SupervisedDataset(
tokenizer=tokenizer,
data_path=args.pretrain_dataset,
max_datasets_size=args.max_datasets_size,
max_length=args.max_input_len,
)
if dist.is_initialized() and dist.get_world_size() > 1:
pretrain_sampler = DistributedSampler(pretrain_dataset, shuffle=True, seed=42, drop_last=True)
else:
pretrain_sampler = None
pretrain_dataloader = DataLoader(
pretrain_dataset, shuffle=(pretrain_sampler is None), sampler=pretrain_sampler, batch_size=args.ptx_batch_size
)
# NOTE: For small models like opt-1.3b, reward model and initial model are not required to be parallelized.
(actor, actor_optim), (critic, critic_optim), reward_model, initial_model = strategy.prepare(
(actor, actor_optim), (critic, critic_optim), reward_model, initial_model
)
# configure trainer
trainer = PPOTrainer(
strategy,
actor,
critic,
reward_model,
initial_model,
actor_optim,
critic_optim,
tokenizer=tokenizer,
kl_coef=args.kl_coef,
ptx_coef=args.ptx_coef,
train_batch_size=args.train_batch_size,
max_length=args.max_seq_len,
use_cache=True,
do_sample=True,
temperature=1.0,
top_k=50,
offload_inference_models=args.strategy != "colossalai_gemini",
)
trainer.fit(
num_episodes=args.num_episodes,
num_collect_steps=args.num_collect_steps,
num_update_steps=args.num_update_steps,
prompt_dataloader=prompt_dataloader,
pretrain_dataloader=pretrain_dataloader,
log_dir=args.log_dir,
use_wandb=args.use_wandb,
)
if args.lora_rank > 0 and args.merge_lora_weights:
from coati.models.lora import LORA_MANAGER
# NOTE: set model to eval to merge LoRA weights
LORA_MANAGER.merge_weights = True
actor.eval()
# save model checkpoint after fitting
strategy.save_pretrained(actor, path=args.save_path)
# save optimizer checkpoint on all ranks
if args.need_optim_ckpt:
strategy.save_optimizer(
actor_optim, "actor_optim_checkpoint_prompts_%d.pt" % (torch.cuda.current_device()), only_rank0=False
)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--prompt_dataset", type=str, default=None, help="path to the prompt dataset")
parser.add_argument("--pretrain_dataset", type=str, default=None, help="path to the pretrained dataset")
parser.add_argument("--max_datasets_size", type=int, default=50000)
parser.add_argument(
"--strategy",
choices=["ddp", "colossalai_gemini", "colossalai_zero2"],
default="colossalai_zero2",
help="strategy to use",
)
parser.add_argument("--model", default="gpt2", choices=["gpt2", "bloom", "opt", "llama"])
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument("--pretrain", type=str, default=None)
parser.add_argument("--rm_model", default=None, choices=["gpt2", "bloom", "opt", "llama"])
parser.add_argument("--rm_path", type=str, default=None)
parser.add_argument("--rm_pretrain", type=str, default=None)
parser.add_argument("--save_path", type=str, default="actor_checkpoint_prompts")
parser.add_argument("--need_optim_ckpt", type=bool, default=False)
parser.add_argument("--num_episodes", type=int, default=10)
parser.add_argument("--num_collect_steps", type=int, default=10)
parser.add_argument("--num_update_steps", type=int, default=5)
parser.add_argument("--train_batch_size", type=int, default=8)
parser.add_argument("--ptx_batch_size", type=int, default=1)
parser.add_argument("--experience_batch_size", type=int, default=8)
parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
parser.add_argument("--merge_lora_weights", type=bool, default=True)
parser.add_argument("--lr", type=float, default=1e-7)
parser.add_argument("--kl_coef", type=float, default=0.1)
parser.add_argument("--ptx_coef", type=float, default=0.9)
parser.add_argument("--max_input_len", type=int, default=96)
parser.add_argument("--max_seq_len", type=int, default=128)
parser.add_argument("--log_dir", default="logs", type=str)
parser.add_argument("--use_wandb", default=False, action="store_true")
args = parser.parse_args()
main(args)


@ -1,25 +0,0 @@
set_n_least_used_CUDA_VISIBLE_DEVICES() {
local n=${1:-"9999"}
echo "GPU Memory Usage:"
local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
tail -n +2 |
nl -v 0 |
tee /dev/tty |
sort -g -k 2 |
awk '{print $1}' |
head -n $n)
export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
echo "Now CUDA_VISIBLE_DEVICES is set to:"
echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
}
set_n_least_used_CUDA_VISIBLE_DEVICES 2
# torchrun --standalone --nproc_per_node=2 train_prompts.py prompts.csv --strategy colossalai_zero2
torchrun --standalone --nproc_per_node=2 train_prompts.py \
--pretrain_dataset /path/to/data.json \
--prompt_dataset /path/to/data.json \
--strategy colossalai_zero2 \
--num_episodes 1 --num_collect_steps 2 --num_update_steps 1 \
--train_batch_size 2


@ -1,208 +0,0 @@
import argparse
import warnings
import torch
import torch.distributed as dist
from coati.dataset import HhRlhfDataset, RmStaticDataset
from coati.models import LogExpLoss, LogSigLoss
from coati.models.bloom import BLOOMRM
from coati.models.gpt import GPTRM
from coati.models.llama import LlamaRM
from coati.models.opt import OPTRM
from coati.trainer import RewardModelTrainer
from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
from datasets import load_dataset
from torch.optim import Adam
from torch.optim.lr_scheduler import CosineAnnealingLR
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from transformers import AutoTokenizer, BloomTokenizerFast, LlamaTokenizer
from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
from colossalai.nn.optimizer import HybridAdam
def train(args):
# configure strategy
if args.strategy == "ddp":
strategy = DDPStrategy()
elif args.strategy == "colossalai_gemini":
strategy = GeminiStrategy(placement_policy="auto")
elif args.strategy == "colossalai_zero2":
strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
else:
raise ValueError(f'Unsupported strategy "{args.strategy}"')
# configure model
if args.lora_rank > 0:
warnings.warn("Lora is not supported yet.")
args.lora_rank = 0
with strategy.model_init_context():
if args.model == "bloom":
model = BLOOMRM(pretrained=args.pretrain, lora_rank=args.lora_rank)
elif args.model == "opt":
model = OPTRM(pretrained=args.pretrain, lora_rank=args.lora_rank)
elif args.model == "gpt2":
model = GPTRM(pretrained=args.pretrain, lora_rank=args.lora_rank)
elif args.model == "llama":
model = LlamaRM(pretrained=args.pretrain, lora_rank=args.lora_rank)
else:
raise ValueError(f'Unsupported model "{args.model}"')
model.to(torch.bfloat16).to(torch.cuda.current_device())
if args.model_path is not None:
state_dict = torch.load(args.model_path)
model.load_state_dict(state_dict)
# configure tokenizer
if args.model == "gpt2":
tokenizer = GPT2Tokenizer.from_pretrained("gpt2" if args.tokenizer is None else args.tokenizer)
tokenizer.pad_token = tokenizer.eos_token
elif args.model == "bloom":
tokenizer = BloomTokenizerFast.from_pretrained(
"bigscience/bloom-560m" if args.tokenizer is None else args.tokenizer
)
tokenizer.pad_token = tokenizer.eos_token
elif args.model == "opt":
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m" if args.tokenizer is None else args.tokenizer)
tokenizer.pad_token = tokenizer.eos_token
elif args.model == "llama":
tokenizer = LlamaTokenizer.from_pretrained(
"hf-internal-testing/llama-tokenizer" if args.tokenizer is None else args.tokenizer
)
tokenizer.eos_token = "</s>"
tokenizer.pad_token = tokenizer.unk_token
else:
raise ValueError(f'Unsupported model "{args.model}"')
# configure optimizer
if args.strategy.startswith("colossalai"):
optim = HybridAdam(model.parameters(), lr=args.lr)
else:
optim = Adam(model.parameters(), lr=args.lr)
# configure loss function
if args.loss_fn == "log_sig":
loss_fn = LogSigLoss()
elif args.loss_fn == "log_exp":
loss_fn = LogExpLoss()
else:
raise ValueError(f'Unsupported loss function "{args.loss_fn}"')
# prepare for data and dataset
if args.subset is not None:
data = load_dataset(args.dataset, data_dir=args.subset)
else:
data = load_dataset(args.dataset)
train_data = data["train"].select(range(min(args.max_datasets_size, len(data["train"]))))
eval_data = data["test"].select(range(min(args.max_datasets_size, len(data["test"]))))
if args.dataset == "Dahoas/rm-static":
train_dataset = RmStaticDataset(train_data, tokenizer, args.max_len)
eval_dataset = RmStaticDataset(eval_data, tokenizer, args.max_len)
elif args.dataset == "Anthropic/hh-rlhf":
train_dataset = HhRlhfDataset(train_data, tokenizer, args.max_len)
eval_dataset = HhRlhfDataset(eval_data, tokenizer, args.max_len)
else:
raise ValueError(f'Unsupported dataset "{args.dataset}"')
if dist.is_initialized() and dist.get_world_size() > 1:
train_sampler = DistributedSampler(
train_dataset,
shuffle=True,
seed=42,
drop_last=True,
rank=dist.get_rank(),
num_replicas=dist.get_world_size(),
)
eval_sampler = DistributedSampler(
eval_dataset,
shuffle=True,
seed=42,
drop_last=True,
rank=dist.get_rank(),
num_replicas=dist.get_world_size(),
)
else:
train_sampler = None
eval_sampler = None
train_dataloader = DataLoader(
train_dataset,
shuffle=(train_sampler is None),
sampler=train_sampler,
batch_size=args.batch_size,
pin_memory=True,
)
eval_dataloader = DataLoader(
eval_dataset, shuffle=(eval_sampler is None), sampler=eval_sampler, batch_size=args.batch_size, pin_memory=True
)
lr_scheduler = CosineAnnealingLR(optim, len(train_dataloader) // 100)
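# strategy.prepare wraps the model, optimizer and scheduler for the selected strategy (DDP / Gemini / ZeRO-2) before training.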
strategy_dict = strategy.prepare(dict(model=model, optimizer=optim, lr_scheduler=lr_scheduler))
model = strategy_dict["model"]
optim = strategy_dict["optimizer"]
lr_scheduler = strategy_dict["lr_scheduler"]
trainer = RewardModelTrainer(
model=model,
strategy=strategy,
optim=optim,
lr_scheduler=lr_scheduler,
loss_fn=loss_fn,
max_epochs=args.max_epochs,
)
trainer.fit(
train_dataloader=train_dataloader,
eval_dataloader=eval_dataloader,
log_dir=args.log_dir,
use_wandb=args.use_wandb,
)
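# NOTE: lora_rank is forced to 0 above, so this merge branch is currently unreachable.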
if args.lora_rank > 0 and args.merge_lora_weights:
from coati.models.lora import LORA_MANAGER
# NOTE: set model to eval to merge LoRA weights
LORA_MANAGER.merge_weights = True
model.eval()
# save model checkpoint after fitting on only rank0
state_dict = model.state_dict()
torch.save(state_dict, args.save_path)
# save optimizer checkpoint on all ranks
if args.need_optim_ckpt:
strategy.save_optimizer(
trainer.optimizer, "rm_optim_checkpoint_%d.pt" % (torch.cuda.current_device()), only_rank0=False
)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--strategy", choices=["ddp", "colossalai_gemini", "colossalai_zero2"], default="colossalai_zero2"
)
parser.add_argument("--model", choices=["gpt2", "bloom", "opt", "llama"], default="bloom")
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument("--pretrain", type=str, default=None)
parser.add_argument("--model_path", type=str, default=None)
parser.add_argument("--need_optim_ckpt", type=bool, default=False)
parser.add_argument(
"--dataset", type=str, choices=["Anthropic/hh-rlhf", "Dahoas/rm-static"], default="Dahoas/rm-static"
)
parser.add_argument("--subset", type=lambda x: None if x == "None" else x, default=None)
parser.add_argument("--max_datasets_size", type=int, default=1000000)
parser.add_argument("--save_path", type=str, default="rm_ckpt")
parser.add_argument("--max_epochs", type=int, default=1)
parser.add_argument("--batch_size", type=int, default=1)
parser.add_argument("--max_len", type=int, default=512)
parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
parser.add_argument("--merge_lora_weights", type=bool, default=True)
parser.add_argument("--lr", type=float, default=9e-6)
parser.add_argument("--loss_fn", type=str, default="log_sig", choices=["log_sig", "log_exp"])
parser.add_argument("--log_dir", default="logs", type=str)
parser.add_argument("--use_wandb", default=False, action="store_true")
args = parser.parse_args()
train(args)


@@ -1,25 +0,0 @@
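# Pick the n GPUs with the least memory currently in use and expose only those via CUDA_VISIBLE_DEVICES.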
set_n_least_used_CUDA_VISIBLE_DEVICES() {
local n=${1:-"9999"}
echo "GPU Memory Usage:"
local FIRST_N_GPU_IDS=$(nvidia-smi --query-gpu=memory.used --format=csv |
tail -n +2 |
nl -v 0 |
tee /dev/tty |
sort -g -k 2 |
awk '{print $1}' |
head -n $n)
export CUDA_VISIBLE_DEVICES=$(echo $FIRST_N_GPU_IDS | sed 's/ /,/g')
echo "Now CUDA_VISIBLE_DEVICES is set to:"
echo "CUDA_VISIBLE_DEVICES=$CUDA_VISIBLE_DEVICES"
}
set_n_least_used_CUDA_VISIBLE_DEVICES 2
torchrun --standalone --nproc_per_node=2 train_reward_model.py \
--pretrain 'gpt2' \
--model 'gpt2' \
--strategy colossalai_zero2 \
--loss_fn 'log_exp' \
--dataset 'Anthropic/hh-rlhf' \
--batch_size 16 \
--max_epochs 10


@@ -1,221 +0,0 @@
import argparse
import math
import warnings
import torch
import torch.distributed as dist
from coati.dataset import SFTDataset, SupervisedDataset
from coati.models.bloom import BLOOMActor
from coati.models.chatglm import ChatGLMActor
from coati.models.chatglm.chatglm_tokenizer import ChatGLMTokenizer
from coati.models.gpt import GPTActor
from coati.models.llama import LlamaActor
from coati.models.opt import OPTActor
from coati.trainer import SFTTrainer
from coati.trainer.strategies import DDPStrategy, GeminiStrategy, LowLevelZeroStrategy
from datasets import load_dataset
from torch.optim import Adam
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler
from transformers import AutoTokenizer, BloomTokenizerFast, LlamaTokenizer
from transformers.models.gpt2.tokenization_gpt2 import GPT2Tokenizer
from transformers.trainer import get_scheduler
from colossalai.logging import get_dist_logger
from colossalai.nn.optimizer import HybridAdam
def train(args):
# configure strategy
if args.strategy == "ddp":
strategy = DDPStrategy()
elif args.strategy == "colossalai_gemini":
strategy = GeminiStrategy(placement_policy="auto")
elif args.strategy == "colossalai_zero2":
strategy = LowLevelZeroStrategy(stage=2, placement_policy="cuda")
elif args.strategy == "colossalai_zero2_cpu":
strategy = LowLevelZeroStrategy(stage=2, placement_policy="cpu")
else:
raise ValueError(f'Unsupported strategy "{args.strategy}"')
# configure model
if args.lora_rank > 0:
warnings.warn("Lora is not supported yet.")
args.lora_rank = 0
with strategy.model_init_context():
if args.model == "bloom":
model = BLOOMActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
elif args.model == "opt":
model = OPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
elif args.model == "gpt2":
model = GPTActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
elif args.model == "llama":
model = LlamaActor(pretrained=args.pretrain, lora_rank=args.lora_rank, checkpoint=args.grad_checkpoint)
elif args.model == "chatglm":
model = ChatGLMActor(pretrained=args.pretrain)
else:
raise ValueError(f'Unsupported model "{args.model}"')
model.to(torch.bfloat16).to(torch.cuda.current_device())
# configure tokenizer
if args.model == "gpt2":
tokenizer = GPT2Tokenizer.from_pretrained("gpt2" if args.tokenizer is None else args.tokenizer)
tokenizer.pad_token = tokenizer.eos_token
elif args.model == "bloom":
tokenizer = BloomTokenizerFast.from_pretrained(
"bigscience/bloom-560m" if args.tokenizer is None else args.tokenizer
)
tokenizer.pad_token = tokenizer.eos_token
elif args.model == "opt":
tokenizer = AutoTokenizer.from_pretrained("facebook/opt-350m" if args.tokenizer is None else args.tokenizer)
tokenizer.pad_token = tokenizer.eos_token
elif args.model == "llama":
tokenizer = LlamaTokenizer.from_pretrained(
"hf-internal-testing/llama-tokenizer" if args.tokenizer is None else args.tokenizer
)
tokenizer.eos_token = "</s>"
tokenizer.pad_token = tokenizer.unk_token
elif args.model == "chatglm":
tokenizer = ChatGLMTokenizer.from_pretrained(
"THUDM/chatglm-6b" if args.tokenizer is None else args.tokenizer, trust_remote_code=True
)
else:
raise ValueError(f'Unsupported model "{args.model}"')
# configure optimizer
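# HybridAdam is ColossalAI's fused CPU/GPU Adam; clipping_norm=1.0 additionally requests gradient-norm clipping when a colossalai strategy is used.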
if args.strategy.startswith("colossalai"):
optim = HybridAdam(model.parameters(), lr=args.lr, clipping_norm=1.0)
else:
optim = Adam(model.parameters(), lr=args.lr)
# configure dataset
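# yizhongw/self_instruct is loaded from the Hugging Face Hub with train/test splits; any other --dataset value is treated as a local data path for SupervisedDataset, in which case no evaluation set is built.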
if args.dataset == "yizhongw/self_instruct":
train_data = load_dataset(args.dataset, "super_natural_instructions", split="train")
eval_data = load_dataset(args.dataset, "super_natural_instructions", split="test")
if args.max_datasets_size is not None:
train_data = train_data.select(range(min(args.max_datasets_size, len(train_data))))
eval_data = eval_data.select(range(min(args.max_datasets_size, len(eval_data))))
train_dataset = SFTDataset(train_data, tokenizer, args.max_len)
eval_dataset = SFTDataset(eval_data, tokenizer, args.max_len)
else:
train_dataset = SupervisedDataset(
tokenizer=tokenizer,
data_path=args.dataset,
max_datasets_size=args.max_datasets_size,
max_length=args.max_len,
)
eval_dataset = None
if dist.is_initialized() and dist.get_world_size() > 1:
train_sampler = DistributedSampler(
train_dataset,
shuffle=True,
seed=42,
drop_last=True,
rank=dist.get_rank(),
num_replicas=dist.get_world_size(),
)
if eval_dataset is not None:
eval_sampler = DistributedSampler(
eval_dataset,
shuffle=False,
seed=42,
drop_last=False,
rank=dist.get_rank(),
num_replicas=dist.get_world_size(),
)
else:
train_sampler = None
eval_sampler = None
train_dataloader = DataLoader(
train_dataset,
shuffle=(train_sampler is None),
sampler=train_sampler,
batch_size=args.batch_size,
pin_memory=True,
)
if eval_dataset is not None:
eval_dataloader = DataLoader(
eval_dataset,
shuffle=(eval_sampler is None),
sampler=eval_sampler,
batch_size=args.batch_size,
pin_memory=True,
)
else:
eval_dataloader = None
num_update_steps_per_epoch = len(train_dataloader) // args.accumulation_steps
max_steps = math.ceil(args.max_epochs * num_update_steps_per_epoch)
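# Cosine decay with warmup over roughly 3% of the total optimizer updates (one update per accumulation_steps batches).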
lr_scheduler = get_scheduler(
"cosine", optim, num_warmup_steps=math.ceil(max_steps * 0.03), num_training_steps=max_steps
)
strategy_dict = strategy.prepare(dict(model=model, optimizer=optim, lr_scheduler=lr_scheduler))
model = strategy_dict["model"]
optim = strategy_dict["optimizer"]
lr_scheduler = strategy_dict["lr_scheduler"]
trainer = SFTTrainer(
model=model,
strategy=strategy,
optim=optim,
lr_scheduler=lr_scheduler,
max_epochs=args.max_epochs,
accumulation_steps=args.accumulation_steps,
)
logger = get_dist_logger()
trainer.fit(
train_dataloader=train_dataloader,
eval_dataloader=eval_dataloader,
logger=logger,
log_dir=args.log_dir,
use_wandb=args.use_wandb,
)
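# NOTE: as in the reward-model script, lora_rank is forced to 0 above, so this merge branch is never taken.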
if args.lora_rank > 0 and args.merge_lora_weights:
from coati.models.lora import LORA_MANAGER
# NOTE: set model to eval to merge LoRA weights
LORA_MANAGER.merge_weights = True
model.eval()
# save model checkpoint after fitting on only rank0
strategy.save_pretrained(model, path=args.save_path, tokenizer=tokenizer)
# save optimizer checkpoint on all ranks
if args.need_optim_ckpt:
strategy.save_optimizer(
trainer.optimizer, "rm_optim_checkpoint_%d.pt" % (torch.cuda.current_device()), only_rank0=False
)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument(
"--strategy",
choices=["ddp", "colossalai_gemini", "colossalai_zero2", "colossalai_zero2_cpu"],
default="colossalai_zero2",
)
parser.add_argument("--model", choices=["gpt2", "bloom", "opt", "llama", "chatglm"], default="bloom")
parser.add_argument("--tokenizer", type=str, default=None)
parser.add_argument("--pretrain", type=str, default=None)
parser.add_argument("--dataset", type=str, default=None)
parser.add_argument("--max_datasets_size", type=int, default=None)
parser.add_argument("--save_path", type=str, default="output")
parser.add_argument("--need_optim_ckpt", type=bool, default=False)
parser.add_argument("--max_epochs", type=int, default=3)
parser.add_argument("--batch_size", type=int, default=4)
parser.add_argument("--max_len", type=int, default=512)
parser.add_argument("--lora_rank", type=int, default=0, help="low-rank adaptation matrices rank")
parser.add_argument("--merge_lora_weights", type=bool, default=True)
parser.add_argument("--lr", type=float, default=5e-6)
parser.add_argument("--accumulation_steps", type=int, default=8)
parser.add_argument("--log_dir", default="logs", type=str)
parser.add_argument("--use_wandb", default=False, action="store_true")
parser.add_argument("--grad_checkpoint", default=False, action="store_true")
args = parser.parse_args()
train(args)
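
For completeness, a minimal launch sketch for the SFT script above, mirroring the reward-model launch script shown earlier. The file name train_sft.py and the dataset path are assumptions for illustration only; they are not taken from this diff.

# NOTE: train_sft.py and the dataset path below are placeholders -- adjust them to your checkout and data.
torchrun --standalone --nproc_per_node=2 train_sft.py \
    --pretrain 'bigscience/bloom-560m' \
    --model 'bloom' \
    --strategy colossalai_zero2 \
    --dataset /path/to/instruction_data.json \
    --batch_size 4 \
    --accumulation_steps 8 \
    --max_epochs 3 \
    --grad_checkpoint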

Some files were not shown because too many files have changed in this diff.