diff --git a/.compatibility b/.compatibility index a918cb162..d90a74b58 100644 --- a/.compatibility +++ b/.compatibility @@ -1,2 +1 @@ -2.0.0-11.7.0 -2.1.0-11.8.0 +2.1.0-12.1.0 diff --git a/.cuda_ext.json b/.cuda_ext.json index eba19cf05..b8269f837 100644 --- a/.cuda_ext.json +++ b/.cuda_ext.json @@ -1,16 +1,16 @@ { "build": [ { - "torch_command": "pip install torch==1.12.1+cu102 torchvision==0.13.1+cu102 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu102", - "cuda_image": "hpcaitech/cuda-conda:10.2" + "torch_command": "pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu121", + "cuda_image": "hpcaitech/cuda-conda:12.1" }, { - "torch_command": "pip install torch==1.12.1+cu113 torchvision==0.13.1+cu113 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu113", - "cuda_image": "hpcaitech/cuda-conda:11.3" + "torch_command": "pip install torch==2.1.0 torchvision==0.16.0 torchaudio==2.1.0 --index-url https://download.pytorch.org/whl/cu118", + "cuda_image": "hpcaitech/cuda-conda:11.8" }, { - "torch_command": "pip install torch==1.12.1+cu116 torchvision==0.13.1+cu116 torchaudio==0.12.1 --extra-index-url https://download.pytorch.org/whl/cu116", - "cuda_image": "hpcaitech/cuda-conda:11.6" + "torch_command": "pip install torch==2.0.0 torchvision==0.15.1 torchaudio==2.0.1", + "cuda_image": "hpcaitech/cuda-conda:11.7" } ] } diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md index 0dc1262b0..d67cd9fcc 100644 --- a/.github/pull_request_template.md +++ b/.github/pull_request_template.md @@ -3,6 +3,7 @@ - [ ] I have created an issue for this PR for traceability - [ ] The title follows the standard format: `[doc/gemini/tensor/...]: A concise description` - [ ] I have added relevant tags if possible for us to better distinguish different PRs +- [ ] I have installed pre-commit: `pip install pre-commit && pre-commit install` ## 🚨 Issue number diff --git a/.github/workflows/build_on_pr.yml b/.github/workflows/build_on_pr.yml index 2cad504f3..5bdadca78 100644 --- a/.github/workflows/build_on_pr.yml +++ b/.github/workflows/build_on_pr.yml @@ -117,7 +117,7 @@ jobs: cd TensorNVMe conda install cmake pip install -r requirements.txt - pip install -v . + DISABLE_URING=1 pip install -v . - name: Store TensorNVMe Cache run: | diff --git a/.github/workflows/build_on_schedule.yml b/.github/workflows/build_on_schedule.yml index ae1a5275e..e560d0c00 100644 --- a/.github/workflows/build_on_schedule.yml +++ b/.github/workflows/build_on_schedule.yml @@ -44,7 +44,7 @@ jobs: cd TensorNVMe conda install cmake pip install -r requirements.txt - pip install -v . + DISABLE_URING=1 pip install -v . - uses: actions/checkout@v2 if: steps.check-avai.outputs.avai == 'true' @@ -67,7 +67,6 @@ jobs: --durations=0 \ tests/ env: - NCCL_SHM_DISABLE: 1 LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 LLAMA_PATH: /data/scratch/llama-tiny diff --git a/.github/workflows/compatiblity_test_on_dispatch.yml b/.github/workflows/compatiblity_test_on_dispatch.yml index 508321299..95a94c27b 100644 --- a/.github/workflows/compatiblity_test_on_dispatch.yml +++ b/.github/workflows/compatiblity_test_on_dispatch.yml @@ -50,7 +50,7 @@ jobs: matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} container: image: ${{ matrix.container }} - options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny + options: --gpus all --rm -v /dev/shm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny timeout-minutes: 120 steps: - name: Install dependencies @@ -66,7 +66,7 @@ jobs: cd TensorNVMe apt update && apt install -y cmake pip install -r requirements.txt - pip install -v . + DISABLE_URING=1 pip install -v . - uses: actions/checkout@v2 with: ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} @@ -83,13 +83,12 @@ jobs: fi - name: Install Colossal-AI run: | - CUDA_EXT=1 pip install -v . + BUILD_EXT=1 pip install -v . pip install -r requirements/requirements-test.txt - name: Unit Testing run: | - PYTHONPATH=$PWD pytest tests + PYTHONPATH=$PWD pytest --durations=0 tests env: DATA: /data/scratch/cifar-10 - NCCL_SHM_DISABLE: 1 LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 LLAMA_PATH: /data/scratch/llama-tiny diff --git a/.github/workflows/compatiblity_test_on_pr.yml b/.github/workflows/compatiblity_test_on_pr.yml index cc17c66f9..aef4816ef 100644 --- a/.github/workflows/compatiblity_test_on_pr.yml +++ b/.github/workflows/compatiblity_test_on_pr.yml @@ -41,7 +41,7 @@ jobs: matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} container: image: ${{ matrix.container }} - options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny + options: --gpus all --rm -v /dev/shm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny timeout-minutes: 120 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-test-${{ matrix.container }} @@ -60,7 +60,7 @@ jobs: cd TensorNVMe apt update && apt install -y cmake pip install -r requirements.txt - pip install -v . + DISABLE_URING=1 pip install -v . - uses: actions/checkout@v2 with: ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} @@ -78,13 +78,12 @@ jobs: - name: Install Colossal-AI run: | - CUDA_EXT=1 pip install -v . + BUILD_EXT=1 pip install -v . pip install -r requirements/requirements-test.txt - name: Unit Testing run: | - PYTHONPATH=$PWD pytest tests + PYTHONPATH=$PWD pytest --durations=0 tests env: DATA: /data/scratch/cifar-10 - NCCL_SHM_DISABLE: 1 LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 LLAMA_PATH: /data/scratch/llama-tiny diff --git a/.github/workflows/compatiblity_test_on_schedule.yml b/.github/workflows/compatiblity_test_on_schedule.yml index 158fe751b..3dc8a5a32 100644 --- a/.github/workflows/compatiblity_test_on_schedule.yml +++ b/.github/workflows/compatiblity_test_on_schedule.yml @@ -38,7 +38,7 @@ jobs: matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} container: image: ${{ matrix.container }} - options: --gpus all --rm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny + options: --gpus all --rm -v /dev/shm -v /data/scratch/cifar-10:/data/scratch/cifar-10 -v /data/scratch/llama-tiny:/data/scratch/llama-tiny timeout-minutes: 120 steps: - name: Install dependencies @@ -56,7 +56,7 @@ jobs: cd TensorNVMe apt update && apt install -y cmake pip install -r requirements.txt - pip install -v . + DISABLE_URING=1 pip install -v . - uses: actions/checkout@v2 with: ssh-key: ${{ secrets.SSH_KEY_FOR_CI }} @@ -75,15 +75,14 @@ jobs: - name: Install Colossal-AI run: | - CUDA_EXT=1 pip install -v . + BUILD_EXT=1 pip install -v . pip install -r requirements/requirements-test.txt - name: Unit Testing run: | - PYTHONPATH=$PWD pytest tests + PYTHONPATH=$PWD pytest --durations=0 tests env: DATA: /data/scratch/cifar-10 - NCCL_SHM_DISABLE: 1 LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64 LLAMA_PATH: /data/scratch/llama-tiny diff --git a/.github/workflows/cuda_ext_check_before_merge.yml b/.github/workflows/cuda_ext_check_before_merge.yml index 686f0f395..14f53bd69 100644 --- a/.github/workflows/cuda_ext_check_before_merge.yml +++ b/.github/workflows/cuda_ext_check_before_merge.yml @@ -51,4 +51,4 @@ jobs: - name: Build run: | - CUDA_EXT=1 pip install -v . + BUILD_EXT=1 pip install -v . diff --git a/.github/workflows/doc_test_on_pr.yml b/.github/workflows/doc_test_on_pr.yml index 51238905e..8afc46b87 100644 --- a/.github/workflows/doc_test_on_pr.yml +++ b/.github/workflows/doc_test_on_pr.yml @@ -89,7 +89,7 @@ jobs: - name: Install ColossalAI run: | source activate pytorch - CUDA_EXT=1 pip install -v . + BUILD_EXT=1 pip install -v . - name: Test the Doc run: | diff --git a/.github/workflows/doc_test_on_schedule.yml b/.github/workflows/doc_test_on_schedule.yml index b3536184d..e2491e460 100644 --- a/.github/workflows/doc_test_on_schedule.yml +++ b/.github/workflows/doc_test_on_schedule.yml @@ -32,7 +32,7 @@ jobs: - name: Install ColossalAI run: | - CUDA_EXT=1 pip install -v . + BUILD_EXT=1 pip install -v . - name: Install Doc Test Requirements run: | diff --git a/.github/workflows/example_check_on_dispatch.yml b/.github/workflows/example_check_on_dispatch.yml index bba321fd2..d877b06ce 100644 --- a/.github/workflows/example_check_on_dispatch.yml +++ b/.github/workflows/example_check_on_dispatch.yml @@ -46,19 +46,17 @@ jobs: matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}} container: image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 - options: --gpus all --rm -v /data/scratch/examples-data:/data/ + options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm timeout-minutes: 15 steps: - name: 📚 Checkout uses: actions/checkout@v3 - name: Install Colossal-AI run: | - CUDA_EXT=1 pip install -v . + BUILD_EXT=1 pip install -v . - name: Test the example run: | dir=${{ matrix.directory }} echo "Testing ${dir} now" cd "${PWD}/examples/${dir}" bash test_ci.sh - env: - NCCL_SHM_DISABLE: 1 diff --git a/.github/workflows/example_check_on_pr.yml b/.github/workflows/example_check_on_pr.yml index fcff8e569..6170628e1 100644 --- a/.github/workflows/example_check_on_pr.yml +++ b/.github/workflows/example_check_on_pr.yml @@ -78,7 +78,7 @@ jobs: matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}} container: image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 - options: --gpus all --rm -v /data/scratch/examples-data:/data/ + options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm timeout-minutes: 20 concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-run-example-${{ matrix.directory }} @@ -88,12 +88,10 @@ jobs: - name: Install Colossal-AI run: | - CUDA_EXT=1 pip install -v . + BUILD_EXT=1 pip install -v . - name: Test the example run: | example_dir=${{ matrix.directory }} cd "${PWD}/examples/${example_dir}" bash test_ci.sh - env: - NCCL_SHM_DISABLE: 1 diff --git a/.github/workflows/example_check_on_schedule.yml b/.github/workflows/example_check_on_schedule.yml index abb947949..2588ac824 100644 --- a/.github/workflows/example_check_on_schedule.yml +++ b/.github/workflows/example_check_on_schedule.yml @@ -35,6 +35,7 @@ jobs: matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}} container: image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 + options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm timeout-minutes: 10 steps: - name: 📚 Checkout @@ -42,7 +43,7 @@ jobs: - name: Install Colossal-AI run: | - CUDA_EXT=1 pip install -v . + BUILD_EXT=1 pip install -v . - name: Traverse all files run: | @@ -50,8 +51,6 @@ jobs: echo "Testing ${example_dir} now" cd "${PWD}/examples/${example_dir}" bash test_ci.sh - env: - NCCL_SHM_DISABLE: 1 - name: Notify Lark id: message-preparation diff --git a/.github/workflows/post_commit.yml b/.github/workflows/post_commit.yml deleted file mode 100644 index 1bbc0d2f5..000000000 --- a/.github/workflows/post_commit.yml +++ /dev/null @@ -1,97 +0,0 @@ -name: post-commit - -on: - pull_request: - types: - - closed - -jobs: - # this job will run after a PR is merged to run pre-commit on any changed file - # so that the user does not need to learn pre-commit and pre-commit can still - # be auto-executed by the workflow - pre-commit: - runs-on: ubuntu-latest - if: github.event.pull_request.merged == true && github.repository == 'hpcaitech/ColossalAI' - steps: - - uses: actions/checkout@v2 - with: - fetch-depth: 0 - ref: ${{ github.event.pull_request.head.sha }} - - # the PR branch and the hpcaitech/colossal-ai main branch - # must share a common commit, we need to locate that commit, - # which is the commit checked-out or forked when the PR branch is created - # such that we can look for files changed since that commit - - name: Locate base commit - id: locate-base-sha - run: | - curBranch=$(git rev-parse --abbrev-ref HEAD) - commonCommit=$(git merge-base origin/main $curBranch) - echo $commonCommit - echo "baseSHA=$commonCommit" >> $GITHUB_OUTPUT - - - name: Find the changed files - id: find-changed-files - uses: tj-actions/changed-files@v35 - with: - base_sha: ${{ steps.locate-base-sha.outputs.baseSHA }} - - - name: List all changed files - run: | - for file in ${{ steps.find-changed-files.outputs.all_changed_files }}; do - echo "$file was changed" - done - - # check out the main branch - - uses: actions/checkout@v2 - with: - ref: 'main' - - - uses: actions/setup-python@v3 - - - name: Cache pre-commit hooks - uses: actions/cache@v3 - with: - path: ~/.cache/pre-commit - key: ${{ runner.os }}-pre-commit-hooks - - - name: Set up pre-commit - run: | - pip install pre-commit - pre-commit install - - # run pre-commit on changed files - - name: Run Pre-commit - run: | - for file in ${{ steps.find-changed-files.outputs.all_changed_files }}; do - pre-commit run --files $file || true - done - - # create commit for pre-commit - # when all files are well formatted, there is no need to create a commit - # therefore, this step will produce an error, which should be allowed - - name: Create commits - id: commit - continue-on-error: true - run: | - git config --global user.name 'github-actions' - git config --global user.email 'github-actions@github.com' - git remote set-url origin https://x-access-token:${{ secrets.GITHUB_TOKEN }}@github.com/${{ github.repository }} - git add -A - git commit -am "[format] applied code formatting on changed files in pull request ${{ github.event.pull_request.number }}" - - # create pull request - - name: Create Pull Request - if: steps.commit.outcome == 'success' - id: cpr - uses: peter-evans/create-pull-request@v4 - with: - branch: pre-commit-${{ github.event.pull_request.number }} - title: "[format] applied code formatting on changed files in PR ${{ github.event.pull_request.number }}" - - - name: Enable Auto-merge for the New PR - if: steps.commit.outcome == 'success' - uses: peter-evans/enable-pull-request-automerge@v2 - with: - pull-request-number: ${{ steps.cpr.outputs.pull-request-number }} - merge-method: squash diff --git a/.github/workflows/run_chatgpt_examples.yml b/.github/workflows/run_chatgpt_examples.yml index bb0ceb4a8..ba997f144 100644 --- a/.github/workflows/run_chatgpt_examples.yml +++ b/.github/workflows/run_chatgpt_examples.yml @@ -19,8 +19,8 @@ jobs: runs-on: [self-hosted, gpu] container: image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 - options: --gpus all --rm -v /data/scratch/github_actions/chat:/data/scratch/github_actions/chat --shm-size=10.24gb - timeout-minutes: 30 + options: --gpus all --rm -v /data/scratch/examples-data:/data/scratch/examples-data --shm-size=10.24gb + timeout-minutes: 60 defaults: run: shell: bash @@ -28,26 +28,35 @@ jobs: - name: Checkout ColossalAI uses: actions/checkout@v2 + - name: Install Colossal-AI + run: | + BUILD_EXT=1 pip install -v -e . + - name: Install ChatGPT run: | - cd applications/Chat + cd applications/ColossalChat pip install -v . + export BUILD_EXT=1 pip install -r examples/requirements.txt - name: Install Transformers run: | - pip install transformers==4.30.2 + pip install transformers==4.34.1 - name: Execute Examples run: | - cd applications/Chat + cd applications/ColossalChat rm -rf ~/.cache/colossalai - ./tests/test_inference.sh - ./tests/test_benchmarks.sh + mkdir models + mkdir sft_data + mkdir prompt_data + mkdir preference_data + ./tests/test_data_preparation.sh ./tests/test_train.sh env: NCCL_SHM_DISABLE: 1 MAX_JOBS: 8 - SFT_DATASET: /data/scratch/github_actions/chat/data.json - PROMPT_DATASET: /data/scratch/github_actions/chat/prompts_en.jsonl - PRETRAIN_DATASET: /data/scratch/github_actions/chat/alpaca_data.json + PRETRAINED_MODEL_PATH: ./models + SFT_DATASET: ./sft_data + PROMPT_DATASET: ./prompt_data + PREFERENCE_DATASET: ./preference_data diff --git a/.github/workflows/run_chatgpt_unit_tests.yml b/.github/workflows/run_chatgpt_unit_tests.yml index 7986889e0..1d8a53e4f 100644 --- a/.github/workflows/run_chatgpt_unit_tests.yml +++ b/.github/workflows/run_chatgpt_unit_tests.yml @@ -21,7 +21,7 @@ jobs: runs-on: [self-hosted, gpu] container: image: hpcaitech/pytorch-cuda:2.1.0-12.1.0 - options: --gpus all --rm -v /data/scratch/chatgpt:/data/scratch/chatgpt + options: --gpus all --rm -v /data/scratch/examples-data:/data/scratch/examples-data timeout-minutes: 30 defaults: run: @@ -32,15 +32,17 @@ jobs: - name: Install ChatGPT run: | - cd applications/Chat + cd applications/ColossalChat pip install -v . - pip install -r requirements-test.txt + pip install pytest - name: Execute Unit Testing run: | - cd applications/Chat + cd applications/ColossalChat rm -rf ~/.cache/colossalai pytest tests/ + cd ./tests + ./test_templating.sh env: NCCL_SHM_DISABLE: 1 MAX_JOBS: 8 diff --git a/.gitignore b/.gitignore index 81113fa99..8bc74b4c8 100644 --- a/.gitignore +++ b/.gitignore @@ -159,3 +159,7 @@ coverage.xml # ignore testmon and coverage files .coverage .testmondata* + +# log, test files - ColossalChat +applications/ColossalChat/logs +applications/ColossalChat/tests/logs diff --git a/LICENSE b/LICENSE index bacb03e72..47197afe6 100644 --- a/LICENSE +++ b/LICENSE @@ -551,4 +551,4 @@ Copyright 2021- HPC-AI Technology Inc. All rights reserved. AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN - THE SOFTWARE. + THE SOFTWARE. diff --git a/README.md b/README.md index 442e6bbcd..26776bdf6 100644 --- a/README.md +++ b/README.md @@ -25,16 +25,15 @@ ## Latest News +* [2024/03] [314 Billion Parameter Grok-1 Inference Accelerated by 3.8x, Efficient and Easy-to-Use PyTorch+HuggingFace version is Here](https://hpc-ai.com/blog/314-billion-parameter-grok-1-inference-accelerated-by-3.8x-efficient-and-easy-to-use-pytorchhuggingface-version-is-here) +* [2024/03] [Open-Sora: Revealing Complete Model Parameters, Training Details, and Everything for Sora-like Video Generation Models](https://hpc-ai.com/blog/open-sora-v1.0) +* [2024/03] [Open-Sora:Sora Replication Solution with 46% Cost Reduction, Sequence Expansion to Nearly a Million](https://hpc-ai.com/blog/open-sora) * [2024/01] [Inference Performance Improved by 46%, Open Source Solution Breaks the Length Limit of LLM for Multi-Round Conversations](https://hpc-ai.com/blog/Colossal-AI-SwiftInfer) * [2024/01] [Construct Refined 13B Private Model With Just $5000 USD, Upgraded Colossal-AI Llama-2 Open Source](https://hpc-ai.com/blog/colossal-llama-2-13b) * [2023/11] [Enhanced MoE Parallelism, Open-source MoE Model Training Can Be 9 Times More Efficient](https://www.hpc-ai.tech/blog/enhanced-moe-parallelism-open-source-moe-model-training-can-be-9-times-more-efficient) * [2023/09] [One Half-Day of Training Using a Few Hundred Dollars Yields Similar Results to Mainstream Large Models, Open-Source and Commercial-Free Domain-Specific LLM Solution](https://www.hpc-ai.tech/blog/one-half-day-of-training-using-a-few-hundred-dollars-yields-similar-results-to-mainstream-large-models-open-source-and-commercial-free-domain-specific-llm-solution) * [2023/09] [70 Billion Parameter LLaMA2 Model Training Accelerated by 195%](https://www.hpc-ai.tech/blog/70b-llama2-training) * [2023/07] [HPC-AI Tech Raises 22 Million USD in Series A Funding](https://www.hpc-ai.tech/blog/hpc-ai-tech-raises-22-million-usd-in-series-a-funding-to-fuel-team-expansion-and-business-growth) -* [2023/07] [65B Model Pretraining Accelerated by 38%, Best Practices for Building LLaMA-Like Base Models Open-Source](https://www.hpc-ai.tech/blog/large-model-pretraining) -* [2023/03] [ColossalChat: An Open-Source Solution for Cloning ChatGPT With a Complete RLHF Pipeline](https://medium.com/@yangyou_berkeley/colossalchat-an-open-source-solution-for-cloning-chatgpt-with-a-complete-rlhf-pipeline-5edf08fb538b) -* [2023/03] [Intel and Colossal-AI Partner to Deliver Cost-Efficient Open-Source Solution for Protein Folding Structure Prediction](https://www.hpc-ai.tech/blog/intel-habana) -* [2023/03] [AWS and Google Fund Colossal-AI with Startup Cloud Programs](https://www.hpc-ai.tech/blog/aws-and-google-fund-colossal-ai-with-startup-cloud-programs) ## Table of Contents