mirror of
https://github.com/hpcaitech/ColossalAI.git
synced 2026-04-11 14:43:10 +00:00
* benchmark gpt2 * fix fix fix fix * [doc] fix typo in Colossal-LLaMA-2/README.md (#5247) * [workflow] fixed build CI (#5240) * [workflow] fixed build CI * polish * polish * polish * polish * polish * [ci] fixed booster test (#5251) * [ci] fixed booster test * [ci] fixed booster test * [ci] fixed booster test * [ci] fixed ddp test (#5254) * [ci] fixed ddp test * polish * fix typo in applications/ColossalEval/README.md (#5250) * [ci] fix shardformer tests. (#5255) * fix ci fix * revert: revert p2p * feat: add enable_metadata_cache option * revert: enable t5 tests --------- Co-authored-by: Wenhao Chen <cwher@outlook.com> * [doc] fix doc typo (#5256) * [doc] fix annotation display * [doc] fix llama2 doc * [hotfix]: add pp sanity check and fix mbs arg (#5268) * fix: fix misleading mbs arg * feat: add pp sanity check * fix: fix 1f1b sanity check * [workflow] fixed incomplete bash command (#5272) * [workflow] fixed oom tests (#5275) * [workflow] fixed oom tests * polish * polish * polish * [ci] fix test_hybrid_parallel_plugin_checkpoint_io.py (#5276) * fix ci fix * fix test * revert: revert p2p * feat: add enable_metadata_cache option * revert: enable t5 tests * fix --------- Co-authored-by: Wenhao Chen <cwher@outlook.com> * [shardformer] hybridparallelplugin support gradients accumulation. 
(#5246) * support gradients acc fix fix fix fix fix fix fix fix fix fix fix fix fix * fix fix * fix fix fix * [hotfix] Fix ShardFormer test execution path when using sequence parallelism (#5230) * fix auto loading gpt2 tokenizer (#5279) * [doc] add llama2-13B disyplay (#5285) * Update README.md * fix 13b typo --------- Co-authored-by: binmakeswell <binmakeswell@gmail.com> * fix llama pretrain (#5287) * fix * fix * fix fix * fix fix fix * fix fix * benchmark gpt2 * fix fix fix fix * [workflow] fixed build CI (#5240) * [workflow] fixed build CI * polish * polish * polish * polish * polish * [ci] fixed booster test (#5251) * [ci] fixed booster test * [ci] fixed booster test * [ci] fixed booster test * fix fix * fix fix fix * fix * fix fix fix fix fix * fix * Update shardformer.py --------- Co-authored-by: digger yu <digger-yu@outlook.com> Co-authored-by: Frank Lee <somerlee.9@gmail.com> Co-authored-by: Wenhao Chen <cwher@outlook.com> Co-authored-by: binmakeswell <binmakeswell@gmail.com> Co-authored-by: Zhongkai Zhao <kanezz620@gmail.com> Co-authored-by: Michelle <97082656+MichelleMa8@users.noreply.github.com> Co-authored-by: Desperado-Jia <502205863@qq.com>
86 lines
2.9 KiB
YAML
86 lines
2.9 KiB
YAML
# Scheduled build-and-test workflow for Colossal-AI.
# Runs the unit-test suite (everything not marked "largedist") inside a CUDA
# container on a self-hosted GPU runner, and posts a Lark message on failure.
name: Build on Schedule

on:
  schedule:
    # run at 00:00 UTC every day (the day-of-week field is a wildcard, so this
    # is a daily schedule, not a weekly one)
    - cron: "0 0 * * *"
  # also allow the workflow to be started manually
  workflow_dispatch:

jobs:
  build:
    name: Build and Test Colossal-AI
    # only run in the upstream repository; the job is skipped anywhere else
    if: github.repository == 'hpcaitech/ColossalAI'
    runs-on: [self-hosted, gpu]
    container:
      image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
      # expose all GPUs and mount the tiny LLaMA fixture used by the tests
      options: --gpus all --rm -v /dev/shm -v /data/scratch/llama-tiny:/data/scratch/llama-tiny
    timeout-minutes: 90
    steps:
      # ensure all GPUs have enough memory: every later build/test step is
      # gated on the `avai` output produced here
      - name: Check GPU Availability
        id: check-avai
        run: |
          avai=true
          ngpu=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
          endIndex=$((ngpu - 1))
          for i in $(seq 0 "$endIndex"); do
            gpu_used=$(nvidia-smi -i "$i" --query-gpu=memory.used --format=csv,noheader,nounits)
            # a GPU with more than 2000 (nvidia-smi memory units) in use is
            # treated as busy
            if [ "$gpu_used" -gt 2000 ]; then
              avai=false
            fi
          done

          echo "GPU is available: $avai"
          echo "avai=$avai" >> "$GITHUB_OUTPUT"

      # NOTE(review): actions/checkout@v2 targets a deprecated Node runtime;
      # consider a newer major version once the self-hosted runner is
      # confirmed to support it.
      - uses: actions/checkout@v2
        if: steps.check-avai.outputs.avai == 'true'
        with:
          repository: hpcaitech/TensorNVMe
          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}
          path: TensorNVMe

      - name: Install tensornvme
        if: steps.check-avai.outputs.avai == 'true'
        run: |
          cd TensorNVMe
          # -y: never wait for an interactive confirmation in CI
          conda install -y cmake
          pip install -r requirements.txt
          pip install -v .

      - uses: actions/checkout@v2
        if: steps.check-avai.outputs.avai == 'true'
        with:
          ssh-key: ${{ secrets.SSH_KEY_FOR_CI }}

      - name: Install Colossal-AI
        if: steps.check-avai.outputs.avai == 'true'
        run: |
          # reuse artifacts from the cache directory when it is non-empty
          if [ -n "$(ls -A /github/home/cuda_ext_cache/)" ]; then
            cp -r /github/home/cuda_ext_cache/* /__w/ColossalAI/ColossalAI/
          fi
          BUILD_EXT=1 pip install -v -e .
          # store the fresh build directory back for the next run
          cp -r /__w/ColossalAI/ColossalAI/build /github/home/cuda_ext_cache/
          pip install -r requirements/requirements-test.txt

      - name: Unit Testing
        if: steps.check-avai.outputs.avai == 'true'
        run: |
          PYTHONPATH=$PWD pytest \
            -m "not largedist" \
            --durations=0 \
            tests/
        env:
          NCCL_SHM_DISABLE: "1"
          LD_LIBRARY_PATH: /github/home/.tensornvme/lib:/usr/local/nvidia/lib:/usr/local/nvidia/lib64
          LLAMA_PATH: /data/scratch/llama-tiny

      # runs only when an earlier step in this job has failed
      - name: Notify Lark
        id: message-preparation
        if: ${{ failure() }}
        run: |
          url=$SERVER_URL/$REPO/actions/runs/$RUN_ID
          msg="Scheduled Build and Test failed, please visit $url for details"
          echo "$msg"
          # quote the webhook URL so characters such as ? or * are passed
          # through verbatim instead of being glob-expanded or word-split
          python .github/workflows/scripts/send_message_to_lark.py -m "$msg" -u "$WEBHOOK_URL"
        env:
          SERVER_URL: ${{ github.server_url }}
          REPO: ${{ github.repository }}
          RUN_ID: ${{ github.run_id }}
          WEBHOOK_URL: ${{ secrets.LARK_NOTIFICATION_WEBHOOK_URL }}